Section 4
Developer Toolkit
Production-ready API guides, library references, and copy-paste code snippets for shipping AI/ML applications.
4.1
OpenAI API
GPT-4o integration with function calling and streaming.
🐍openai_integration.py
import os

from openai import OpenAI

# Read the key from the environment -- never hard-code secrets in source.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Chat completion with function calling: the model may emit a structured
# tool call matching this JSON-schema description instead of plain text.
tools = [{
    "type": "function",
    "function": {
        "name": "search_database",
        "description": "Search the DoD financial database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "fiscal_year": {"type": "integer"}
            },
            "required": ["query"]
        }
    }
}]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Find FY2025 Army modernization data"}],
    tools=tools,
    tool_choice="auto",  # let the model decide whether to call the tool
    temperature=0.3      # low temperature for more deterministic answers
)

# Streaming: pass stream=True and iterate the returned chunks; each chunk
# carries an incremental delta. (Note: `stream.text_stream` is the Anthropic
# SDK's API -- the OpenAI SDK does not provide it.)
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Explain RAG in 3 sentences"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # the final chunk's delta content is None
        print(delta, end="", flush=True)
4.2
scikit-learn Production Pipeline
End-to-end ML pipeline with preprocessing, cross-validation, and model persistence.
🐍ml_pipeline.py
1from sklearn.pipeline import Pipeline
2from sklearn.preprocessing import StandardScaler, LabelEncoder
3from sklearn.compose import ColumnTransformer
4from sklearn.impute import SimpleImputer
5from sklearn.ensemble import GradientBoostingClassifier
6from sklearn.model_selection import GridSearchCV
7import joblib
8
9# Production ML Pipeline
10numeric_features = ['transaction_amount', 'days_outstanding', 'obligation_rate']
11categorical_features = ['appropriation_type', 'vendor_category', 'service_branch']
12
13numeric_transformer = Pipeline([
14 ('imputer', SimpleImputer(strategy='median')),
15 ('scaler', StandardScaler()),
16])
17
18preprocessor = ColumnTransformer([
19 ('num', numeric_transformer, numeric_features),
20 ('cat', SimpleImputer(strategy='most_frequent'), categorical_features),
21])
22
23pipeline = Pipeline([
24 ('preprocessor', preprocessor),
25 ('classifier', GradientBoostingClassifier(random_state=42))
26])
27
28# Hyperparameter tuning
29param_grid = {
30 'classifier__n_estimators': [100, 200],
31 'classifier__max_depth': [3, 5, 7],
32 'classifier__learning_rate': [0.05, 0.1, 0.2],
33}
34
35search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
36search.fit(X_train, y_train)
37
38print(f"Best params: {search.best_params_}")
39print(f"Best F1: {search.best_score_:.4f}")
40
41# Save the model
42joblib.dump(search.best_estimator_, 'audit_risk_model_v1.pkl')Production Tip
Always use Pipeline to prevent data leakage during cross-validation. The preprocessor should be fit only on training data — Pipeline handles this automatically.
4.3
LangChain Guide
Framework for building LLM-powered applications with chains, agents, and memory.
LangChain — Popular
Most mature, extensive integrations, large community. Best for complex chains.
LlamaIndex — RAG-first
Optimized for RAG and document indexing. Better structured data handling.
AutoGen — Agents
Multi-agent conversations. Microsoft's framework for agent orchestration.
4.4
Vector Databases
Semantic search infrastructure for RAG systems.
| DB | Type | Scale | Best For | Cost |
|---|---|---|---|---|
| Chroma | Local/Cloud | Small-Med | Development, prototypes | Free |
| Pinecone | Managed | Large | Production, real-time | Paid |
| Weaviate | Self/Cloud | Large | GraphQL + vectors | Open source |
| pgvector | PostgreSQL | Med-Large | Existing Postgres users | Free |
| Qdrant | Self/Cloud | Large | High performance Rust | Open source |
🐍vectordb_chroma.py
# Chroma (local) - great for development
import os

import chromadb
from chromadb.utils import embedding_functions

# Initialize an on-disk client; data persists across runs in ./chroma_db.
client = chromadb.PersistentClient(path="./chroma_db")

# Use OpenAI embeddings; pull the key from the environment instead of
# hard-coding a secret in source.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ.get("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)

collection = client.get_or_create_collection(
    name="policy_documents",
    embedding_function=openai_ef
)

# Add documents: ids are required and must be unique within the collection.
collection.add(
    documents=["OMB Circular A-11 requires budget justification...",
               "FIAR requires detailed transaction-level data..."],
    metadatas=[{"source": "OMB A-11", "year": 2024},
               {"source": "FIAR", "year": 2024}],
    ids=["omb-a11-001", "fiar-001"]
)

# Semantic search: results are nested per query text, hence the [0] below.
results = collection.query(
    query_texts=["budget submission requirements"],
    n_results=3
)

for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f"[{meta['source']}] {doc[:100]}...")
4.5
Production Code Snippets
Copy-paste ready patterns for common ML engineering tasks.
🐍streaming_api.py
# Streaming responses for better UX
from anthropic import AsyncAnthropic

# Use the async client: the sync `Anthropic` client inside an `async def`
# would block the event loop for every other request while streaming.
client = AsyncAnthropic()

async def stream_analysis(document: str):
    """Stream AI analysis with real-time output.

    Async generator that yields text deltas as the model produces them,
    suitable for relaying to a frontend over Server-Sent Events.
    """
    async with client.messages.stream(
        model="claude-3-5-sonnet-20241022",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": f"Analyze this document: {document}"
        }]
    ) as stream:
        async for text in stream.text_stream:
            yield text  # Send to frontend via SSE

# FastAPI streaming endpoint
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/analyze-stream")
async def analyze_stream(request: dict):
    """SSE endpoint: relays model output to the client chunk by chunk."""
    async def generate():
        async for chunk in stream_analysis(request["document"]):
            yield f"data: {chunk}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")