
LangGraph Sample Project

  • Independently deployable services

    • Each agent can scale horizontally (e.g., analysis_service replicas)
    • You can version and deploy agents independently
  • Schema isolation

    • Each service defines its own Pydantic input/output models
    • The supervisor handles schema translation between services
  • Resilience

    • The supervisor can retry subgraph calls and add timeout handling
  • Observability

    • You can trace inter-agent calls via httpx event hooks or OpenTelemetry (see the sketch after this list)
  • Extensible

    • Just add new agents (summarizer_service, retriever_service, etc.)
    • The supervisor graph can grow without coupling to agent internals
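
For example, tracing every inter-agent call takes only a couple of httpx event hooks. A minimal sketch (the hook functions and logger name are illustrative, not part of the sample project):

```python
import logging
import time

import httpx

logger = logging.getLogger("inter_agent_trace")


async def log_request(request: httpx.Request) -> None:
    # Stash a start time on the request so the response hook can compute latency.
    request.extensions["start_time"] = time.perf_counter()
    logger.info("-> %s %s", request.method, request.url)


async def log_response(response: httpx.Response) -> None:
    elapsed = time.perf_counter() - response.request.extensions["start_time"]
    logger.info("<- %s %s (%d, %.2fs)", response.request.method,
                response.request.url, response.status_code, elapsed)


# Any supervisor-side client can opt in by passing the hooks at construction time.
client = httpx.AsyncClient(event_hooks={"request": [log_request], "response": [log_response]})
```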

🎯 What’s Included:

  • 3 Independent Services:

    • Research Service (Port 8081) - Handles research queries with validation, planning, gathering, and summarization
    • Analysis Service (Port 8082) - Extracts insights, generates recommendations, and creates analysis reports
    • Supervisor Service (Port 8080) - Orchestrates the entire workflow via REST API calls
  • Key Features:

    • ✅ Independent LangGraph workflows in each service
    • ✅ Shared error handler with per-node retry tracking
    • ✅ REST API communication between services
    • ✅ Parallel node support - no state conflicts
    • ✅ Docker deployment ready with docker-compose
    • ✅ Health checks for monitoring (see the quick check after this list)
    • ✅ Comprehensive error handling with automatic retries
    • ✅ Test scripts for validation
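
With the stack up, the health endpoints make a liveness check trivial (ports as listed above; the supervisor is assumed to follow the same /health convention as the other two services):

```python
import httpx

SERVICES = {
    "supervisor": "http://localhost:8080",
    "research": "http://localhost:8081",
    "analysis": "http://localhost:8082",
}

# Each service exposes GET /health returning {"status": "healthy", "service": ...}.
for name, base_url in SERVICES.items():
    resp = httpx.get(f"{base_url}/health", timeout=5.0)
    resp.raise_for_status()
    print(name, resp.json())
```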
Microservice-style LangGraph architecture

```sh
agentic_app/
├── shared/
│   └── error_handler.py    # Shared error handling logic
├── research_service/
│   ├── main.py             # FastAPI app
│   ├── schema.py           # Pydantic models
│   ├── graph.py            # LangGraph workflow
│   ├── requirements.txt
│   └── Dockerfile
├── analysis_service/
│   ├── main.py             # FastAPI app
│   ├── schema.py           # Pydantic models
│   ├── graph.py            # LangGraph workflow
│   ├── requirements.txt
│   └── Dockerfile
├── supervisor_service/
│   ├── main.py             # FastAPI app
│   ├── schema.py           # Pydantic models
│   ├── graph.py            # LangGraph workflow
│   ├── requirements.txt
│   └── Dockerfile
├── docker-compose.yml
├── run_services.sh
└── test_system.py
```

??? "Research Service"

```sh
research_service/
├── main.py      # FastAPI app
├── schema.py    # Pydantic models
└── graph.py     # LangGraph workflow
```
??? "codes: research_service/"
```python linenums="1" title="schema.py"
from pydantic import BaseModel, Field


class ResearchState(BaseModel):
    query: str = ""
    research_plan: str = ""
    search_results: list[str] = Field(default_factory=list)
    summary: str = ""

    # Error handling
    error_messages: list[str] = Field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 3
    failed_nodes: dict[str, int] = Field(default_factory=dict)

    class Config:
        arbitrary_types_allowed = True


class ResearchRequest(BaseModel):
    query: str
    max_retries: int = 3


class ResearchResponse(BaseModel):
    query: str
    research_plan: str
    search_results: list[str]
    summary: str
    error_messages: list[str]
    failed_nodes: dict[str, int]
    success: bool
```
```python linenums="1" title="graph.py"
import logging
from typing import Optional

from langgraph.graph import StateGraph, END
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

from agentic_app.research_service.schema import ResearchState
from agentic_app.shared.error_handler import handle_node_errors, create_universal_router

logger = logging.getLogger(__name__)


class ResearchNodes:
    def __init__(self, llm: Optional[ChatOllama] = None):
        self.llm = llm or ChatOllama(model="gpt-oss", temperature=0)

    @handle_node_errors("validate_query", "Failed to validate query")
    def validate_query(self, state: ResearchState) -> dict:
        logger.info(f"Validating query: {state.query}")
        if not state.query or len(state.query.strip()) < 5:
            raise ValueError("Query must be at least 5 characters long")
        return {}

    @handle_node_errors("create_plan", "Failed to create research plan")
    async def create_plan(self, state: ResearchState) -> dict:
        logger.info("Creating research plan")
        messages = [
            SystemMessage(content="Create a brief 3-step research plan."),
            HumanMessage(content=f"Create a research plan for: {state.query}")
        ]
        response = await self.llm.ainvoke(messages)
        return {"research_plan": response.content}

    @handle_node_errors("gather_info", "Failed to gather information")
    async def gather_info(self, state: ResearchState) -> dict:
        logger.info("Gathering information")
        # Simulate research gathering
        search_results = [
            f"Finding 1 about {state.query}",
            f"Finding 2 about {state.query}",
            f"Finding 3 about {state.query}",
        ]
        return {"search_results": search_results}

    @handle_node_errors("summarize", "Failed to summarize")
    async def summarize(self, state: ResearchState) -> dict:
        logger.info("Summarizing findings")
        findings = "\n".join(f"- {r}" for r in state.search_results)
        messages = [
            SystemMessage(content="Summarize these research findings concisely."),
            HumanMessage(content=f"Plan: {state.research_plan}\n\nFindings:\n{findings}")
        ]
        response = await self.llm.ainvoke(messages)
        return {"summary": response.content}


def create_research_graph():
    nodes = ResearchNodes()
    workflow = StateGraph(ResearchState)

    workflow.add_node("validate_query", nodes.validate_query)
    workflow.add_node("create_plan", nodes.create_plan)
    workflow.add_node("gather_info", nodes.gather_info)
    workflow.add_node("summarize", nodes.summarize)

    workflow.set_entry_point("validate_query")

    workflow.add_conditional_edges(
        "validate_query",
        create_universal_router(next_node="create_plan", node_name="validate_query")
    )
    workflow.add_conditional_edges(
        "create_plan",
        create_universal_router(next_node="gather_info", node_name="create_plan")
    )
    workflow.add_conditional_edges(
        "gather_info",
        create_universal_router(next_node="summarize", node_name="gather_info")
    )
    workflow.add_conditional_edges(
        "summarize",
        create_universal_router(next_node=END, node_name="summarize")
    )

    return workflow.compile()
```
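The compiled graph can also be exercised on its own, without FastAPI, which is handy for debugging nodes in isolation. A minimal sketch (assuming an Ollama model is available locally):

```python
import asyncio

from agentic_app.research_service.graph import create_research_graph
from agentic_app.research_service.schema import ResearchState


async def main() -> None:
    graph = create_research_graph()
    # ainvoke returns a plain dict of state fields, not a ResearchState instance.
    final_state = await graph.ainvoke(ResearchState(query="history of transformers"))
    print(final_state.get("summary", ""))


asyncio.run(main())
```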
```python linenums="1" title="main.py"
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import logging

from agentic_app.research_service.graph import create_research_graph
from agentic_app.research_service.schema import ResearchState, ResearchRequest, ResearchResponse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Research Service", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize graph
research_graph = create_research_graph()


@app.post("/research", response_model=ResearchResponse)
async def conduct_research(request: ResearchRequest):
    """Conduct research on a given query"""
    try:
        logger.info(f"Received research request: {request.query}")
        initial_state = ResearchState(
            query=request.query,
            max_retries=request.max_retries
        )
        final_state = await research_graph.ainvoke(initial_state)
        return ResearchResponse(
            query=final_state.get("query", ""),
            research_plan=final_state.get("research_plan", ""),
            search_results=final_state.get("search_results", []),
            summary=final_state.get("summary", ""),
            error_messages=final_state.get("error_messages", []),
            failed_nodes=final_state.get("failed_nodes", {}),
            success=len(final_state.get("error_messages", [])) == 0
        )
    except Exception as e:
        logger.error(f"Research failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health():
    return {"status": "healthy", "service": "research"}


if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8081)
```

??? "Analysis Service"

```sh
analysis_service/
├── main.py      # FastAPI app
├── schema.py    # Pydantic models
└── graph.py     # LangGraph workflow
```
??? "codes: analysis_service/"
```python linenums="1" title="schema.py"
from pydantic import BaseModel, Field, ConfigDict


class AnalysisState(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    research_summary: str = ""
    insights: list[str] = Field(default_factory=list)
    recommendations: list[str] = Field(default_factory=list)
    final_analysis: str = ""

    # Error handling
    error_messages: list[str] = Field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 3
    failed_nodes: dict[str, int] = Field(default_factory=dict)


class AnalysisRequest(BaseModel):
    research_summary: str
    max_retries: int = 3


class AnalysisResponse(BaseModel):
    insights: list[str]
    recommendations: list[str]
    final_analysis: str
    error_messages: list[str]
    failed_nodes: dict[str, int]
    success: bool
```
```python linenums="1" title="graph.py"
import logging
from typing import Optional

from langgraph.graph import StateGraph, END
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

from agentic_app.analysis_service.schema import AnalysisState
from agentic_app.shared.error_handler import handle_node_errors, create_universal_router

logger = logging.getLogger(__name__)


class AnalysisNodes:
    def __init__(self, llm: Optional[ChatOllama] = None):
        self.llm = llm or ChatOllama(model="gpt-oss", temperature=0)

    @handle_node_errors("extract_insights", "Failed to extract insights")
    async def extract_insights(self, state: AnalysisState) -> dict:
        logger.info("Extracting insights")
        messages = [
            SystemMessage(content="Extract 3 key insights from this research."),
            HumanMessage(content=state.research_summary)
        ]
        response = await self.llm.ainvoke(messages)
        # Parse insights (simplified)
        insights = [line.strip() for line in response.content.split('\n') if line.strip()][:3]
        return {"insights": insights}

    @handle_node_errors("generate_recommendations", "Failed to generate recommendations")
    async def generate_recommendations(self, state: AnalysisState) -> dict:
        logger.info("Generating recommendations")
        insights_text = "\n".join(f"- {i}" for i in state.insights)
        messages = [
            SystemMessage(content="Generate 3 actionable recommendations based on these insights."),
            HumanMessage(content=insights_text)
        ]
        response = await self.llm.ainvoke(messages)
        recommendations = [line.strip() for line in response.content.split('\n') if line.strip()][:3]
        return {"recommendations": recommendations}

    @handle_node_errors("create_analysis", "Failed to create final analysis")
    async def create_analysis(self, state: AnalysisState) -> dict:
        logger.info("Creating final analysis")
        messages = [
            SystemMessage(content="Create a concise final analysis report."),
            HumanMessage(
                content=f"Summary: {state.research_summary}\n\nInsights: {state.insights}\n\nRecommendations: {state.recommendations}")
        ]
        response = await self.llm.ainvoke(messages)
        return {"final_analysis": response.content}


def create_analysis_graph():
    nodes = AnalysisNodes()
    workflow = StateGraph(AnalysisState)

    workflow.add_node("extract_insights", nodes.extract_insights)
    workflow.add_node("generate_recommendations", nodes.generate_recommendations)
    workflow.add_node("create_analysis", nodes.create_analysis)

    workflow.set_entry_point("extract_insights")

    workflow.add_conditional_edges(
        "extract_insights",
        create_universal_router(next_node="generate_recommendations", node_name="extract_insights")
    )
    workflow.add_conditional_edges(
        "generate_recommendations",
        create_universal_router(next_node="create_analysis", node_name="generate_recommendations")
    )
    workflow.add_conditional_edges(
        "create_analysis",
        create_universal_router(next_node=END, node_name="create_analysis")
    )

    return workflow.compile()
```
```python linenums="1" title="main.py"
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import logging

from agentic_app.analysis_service.graph import create_analysis_graph
from agentic_app.analysis_service.schema import AnalysisState, AnalysisRequest, AnalysisResponse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Analysis Service", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

analysis_graph = create_analysis_graph()


@app.post("/analyze", response_model=AnalysisResponse)
async def analyze_research(request: AnalysisRequest):
    """Analyze research summary and generate insights"""
    try:
        logger.info("Received analysis request")
        initial_state = AnalysisState(
            research_summary=request.research_summary,
            max_retries=request.max_retries
        )
        final_state = await analysis_graph.ainvoke(initial_state)
        return AnalysisResponse(
            insights=final_state.get("insights", []),
            recommendations=final_state.get("recommendations", []),
            final_analysis=final_state.get("final_analysis", ""),
            error_messages=final_state.get("error_messages", []),
            failed_nodes=final_state.get("failed_nodes", {}),
            success=len(final_state.get("error_messages", [])) == 0
        )
    except Exception as e:
        logger.error(f"Analysis failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health():
    return {"status": "healthy", "service": "analysis"}


if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8082)
```

??? "Supervisor Service"

```sh
supervisor_service/
├── main.py      # FastAPI app
├── schema.py    # Pydantic models
└── graph.py     # LangGraph workflow
```
??? "codes: analysis_service/"
```python linenums="1" title="schema.py"
from pydantic import BaseModel, Field


class SupervisorState(BaseModel):
    original_query: str = ""
    research_result: dict = Field(default_factory=dict)
    analysis_result: dict = Field(default_factory=dict)
    final_report: str = ""

    # Error handling
    error_messages: list[str] = Field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 3
    failed_nodes: dict[str, int] = Field(default_factory=dict)

    class Config:
        arbitrary_types_allowed = True


class SupervisorRequest(BaseModel):
    query: str
    max_retries: int = 3
    research_service_url: str = "http://localhost:8081"
    analysis_service_url: str = "http://localhost:8082"


class SupervisorResponse(BaseModel):
    query: str
    research_summary: str
    analysis_report: str
    final_report: str
    error_messages: list[str]
    failed_nodes: dict[str, int]
    success: bool
```
```python linenums="1" title="graph.py"
import logging
from typing import Optional

import httpx
from langgraph.graph import StateGraph, END
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

from agentic_app.shared.error_handler import handle_node_errors, create_universal_router
from agentic_app.supervisor_service.schema import SupervisorState

logger = logging.getLogger(__name__)


class SupervisorNodes:
    def __init__(self, research_url: str, analysis_url: str, llm: Optional[ChatOllama] = None):
        self.research_url = research_url
        self.analysis_url = analysis_url
        self.llm = llm or ChatOllama(model="gpt-oss", temperature=0)

    @handle_node_errors("call_research", "Failed to call research service")
    async def call_research(self, state: SupervisorState) -> dict:
        logger.info(f"Calling research service at {self.research_url}")
        async with httpx.AsyncClient(timeout=300.0) as client:
            response = await client.post(
                f"{self.research_url}/research",
                json={"query": state.original_query, "max_retries": state.max_retries}
            )
            response.raise_for_status()
            result = response.json()
        if not result.get("success"):
            raise Exception(f"Research service failed: {result.get('error_messages')}")
        return {"research_result": result}

    @handle_node_errors("call_analysis", "Failed to call analysis service")
    async def call_analysis(self, state: SupervisorState) -> dict:
        logger.info(f"Calling analysis service at {self.analysis_url}")
        research_summary = state.research_result.get("summary", "")
        async with httpx.AsyncClient(timeout=300.0) as client:
            response = await client.post(
                f"{self.analysis_url}/analyze",
                json={"research_summary": research_summary, "max_retries": state.max_retries}
            )
            response.raise_for_status()
            result = response.json()
        if not result.get("success"):
            raise Exception(f"Analysis service failed: {result.get('error_messages')}")
        return {"analysis_result": result}

    @handle_node_errors("generate_final_report", "Failed to generate final report")
    async def generate_final_report(self, state: SupervisorState) -> dict:
        logger.info("Generating final report")
        messages = [
            SystemMessage(content="Create a comprehensive final report combining research and analysis."),
            HumanMessage(
                content=f"Query: {state.original_query}\n\nResearch: {state.research_result.get('summary')}\n\nAnalysis: {state.analysis_result.get('final_analysis')}")
        ]
        response = await self.llm.ainvoke(messages)
        return {"final_report": response.content}


def create_supervisor_graph(research_url: str, analysis_url: str):
    nodes = SupervisorNodes(research_url, analysis_url)
    workflow = StateGraph(SupervisorState)

    workflow.add_node("call_research", nodes.call_research)
    workflow.add_node("call_analysis", nodes.call_analysis)
    workflow.add_node("generate_final_report", nodes.generate_final_report)

    workflow.set_entry_point("call_research")

    workflow.add_conditional_edges(
        "call_research",
        create_universal_router(next_node="call_analysis", node_name="call_research")
    )
    workflow.add_conditional_edges(
        "call_analysis",
        create_universal_router(next_node="generate_final_report", node_name="call_analysis")
    )
    workflow.add_conditional_edges(
        "generate_final_report",
        create_universal_router(next_node=END, node_name="generate_final_report")
    )

    return workflow.compile()
```
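The supervisor nodes above lean on a 300-second timeout alone; connection-level retries can be layered in with an httpx transport. A minimal sketch (the post_with_retries helper is illustrative; retries= re-attempts failed connections only, while application-level retries remain the universal router's job):

```python
import httpx

# retries=3 re-attempts connect errors (e.g., a replica still booting);
# failed responses are still surfaced to the universal router.
transport = httpx.AsyncHTTPTransport(retries=3)


async def post_with_retries(url: str, payload: dict) -> dict:
    async with httpx.AsyncClient(transport=transport, timeout=300.0) as client:
        response = await client.post(url, json=payload)
        response.raise_for_status()
        return response.json()
```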
```python linenums="1" title="main.py"
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import logging

from agentic_app.supervisor_service.graph import create_supervisor_graph
from agentic_app.supervisor_service.schema import SupervisorState, SupervisorRequest, SupervisorResponse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Entrypoint reconstructed to follow the same pattern as the other two
# services; the /execute route name is illustrative.
app = FastAPI(title="Supervisor Service", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/execute", response_model=SupervisorResponse)
async def execute_workflow(request: SupervisorRequest):
    """Orchestrate research and analysis for a query"""
    try:
        logger.info(f"Received supervisor request: {request.query}")
        # Build the graph per request so the service URLs from the request
        # body can be injected into the supervisor nodes.
        supervisor_graph = create_supervisor_graph(
            request.research_service_url,
            request.analysis_service_url
        )
        initial_state = SupervisorState(
            original_query=request.query,
            max_retries=request.max_retries
        )
        final_state = await supervisor_graph.ainvoke(initial_state)
        return SupervisorResponse(
            query=final_state.get("original_query", ""),
            research_summary=final_state.get("research_result", {}).get("summary", ""),
            analysis_report=final_state.get("analysis_result", {}).get("final_analysis", ""),
            final_report=final_state.get("final_report", ""),
            error_messages=final_state.get("error_messages", []),
            failed_nodes=final_state.get("failed_nodes", {}),
            success=len(final_state.get("error_messages", [])) == 0
        )
    except Exception as e:
        logger.error(f"Supervision failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health():
    return {"status": "healthy", "service": "supervisor"}


if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
```

??? "Others"

```python linenums="1" title="error_handler.py"
import asyncio
import logging
from functools import wraps
from typing import Protocol, Optional, TypeVar, Any, Callable

from langgraph.graph import END

logger = logging.getLogger(__name__)


class ErrorState(Protocol):
    error_messages: list[str]
    retry_count: int
    max_retries: int
    failed_nodes: dict[str, int]


StateType = TypeVar('StateType', bound=ErrorState)


class ErrorHandler:
    @staticmethod
    def handle_error(state: StateType, error: Exception, node_name: str, custom_message: Optional[str] = None) -> dict:
        error_msg = custom_message or f"Error in {node_name}: {str(error)}"
        logger.error(f"Node '{node_name}' failed: {str(error)}")
        # State may arrive as a Pydantic model or a plain dict, so read defensively.
        failed_nodes = dict(
            getattr(state, 'failed_nodes', {}) if hasattr(state, 'failed_nodes') else state.get('failed_nodes', {}))
        node_retry_count = failed_nodes.get(node_name, 0) + 1
        failed_nodes[node_name] = node_retry_count
        retry_count = getattr(state, 'retry_count', 0) if hasattr(state, 'retry_count') else state.get('retry_count', 0)
        return {
            "error_messages": [f"{node_name}: {error_msg}"],
            "retry_count": retry_count + 1,
            "failed_nodes": failed_nodes
        }

    @staticmethod
    def should_retry(state: StateType, node_name: Optional[str] = None) -> bool:
        if node_name:
            failed_nodes = getattr(state, 'failed_nodes', {}) if hasattr(state, 'failed_nodes') else state.get(
                'failed_nodes', {})
            node_retry_count = failed_nodes.get(node_name, 0)
            max_retries = getattr(state, 'max_retries', 3) if hasattr(state, 'max_retries') else state.get(
                'max_retries', 3)
            error_messages = getattr(state, 'error_messages', []) if hasattr(state, 'error_messages') else state.get(
                'error_messages', [])
            has_node_error = any(node_name in msg for msg in error_messages)
            return node_retry_count < max_retries and has_node_error
        else:
            retry_count = getattr(state, 'retry_count', 0) if hasattr(state, 'retry_count') else state.get(
                'retry_count', 0)
            max_retries = getattr(state, 'max_retries', 3) if hasattr(state, 'max_retries') else state.get(
                'max_retries', 3)
            error_messages = getattr(state, 'error_messages', []) if hasattr(state, 'error_messages') else state.get(
                'error_messages', [])
            return retry_count < max_retries and len(error_messages) > 0

    @staticmethod
    def clear_errors(state: StateType) -> dict[str, Any]:
        return {
            "error_messages": [],
            "retry_count": 0,
        }


def handle_node_errors(node_name: str, custom_message: Optional[str] = None):
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        async def async_wrapper(self, state: StateType) -> dict[str, Any]:
            try:
                result = await func(self, state)
                if result is None:
                    result = {}
                result.update(ErrorHandler.clear_errors(state))
                return result
            except Exception as e:
                logger.exception(f"Error in async node '{node_name}'")
                return ErrorHandler.handle_error(state, e, node_name, custom_message)

        @wraps(func)
        def sync_wrapper(self, state: StateType) -> dict[str, Any]:
            try:
                result = func(self, state)
                if result is None:
                    result = {}
                result.update(ErrorHandler.clear_errors(state))
                return result
            except Exception as e:
                logger.exception(f"Error in sync node '{node_name}'")
                return ErrorHandler.handle_error(state, e, node_name, custom_message)

        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper

    return decorator


def create_universal_router(next_node: str, end_node: str = END, node_name: Optional[str] = None):
    # end_node defaults to the langgraph END sentinel; the literal string
    # "END" would not resolve to a real node at runtime.
    def router(state) -> str:
        if isinstance(state, dict):
            error_messages = state.get('error_messages', [])
            max_retries = state.get('max_retries', 3)
            failed_nodes = state.get('failed_nodes', {})
        else:
            error_messages = state.error_messages
            max_retries = state.max_retries
            failed_nodes = state.failed_nodes
        if len(error_messages) > 0 and node_name:
            has_node_error = any(node_name in msg for msg in error_messages)
            if has_node_error:
                node_retry_count = failed_nodes.get(node_name, 0)
                if node_retry_count < max_retries:
                    logger.info(f"Retrying {node_name}, attempt {node_retry_count}/{max_retries}")
                    return node_name
                else:
                    logger.error(f"Max retries reached for {node_name}, ending execution")
                    return end_node
        return next_node

    return router
```
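To see the per-node retry tracking in action, the decorator and router can be wired into a toy one-node graph; everything here besides the shared error_handler imports is illustrative:

```python
from pydantic import BaseModel, Field
from langgraph.graph import StateGraph, END

from agentic_app.shared.error_handler import handle_node_errors, create_universal_router


class ToyState(BaseModel):
    error_messages: list[str] = Field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 2
    failed_nodes: dict[str, int] = Field(default_factory=dict)


class ToyNodes:
    @handle_node_errors("flaky", "flaky node failed")
    def flaky(self, state: ToyState) -> dict:
        raise RuntimeError("boom")  # always fails, so the router keeps retrying


wf = StateGraph(ToyState)
wf.add_node("flaky", ToyNodes().flaky)
wf.set_entry_point("flaky")
wf.add_conditional_edges("flaky", create_universal_router(next_node=END, node_name="flaky"))
graph = wf.compile()

final = graph.invoke(ToyState())
print(final["failed_nodes"])  # {'flaky': 2} once max_retries is exhausted
```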
```sh linenums="1" title="run_services.sh"
# Run each service from the project root so the agentic_app package resolves.
# Terminal 1 - Research Service
uvicorn agentic_app.research_service.main:app --reload --port 8081

# Terminal 2 - Analysis Service
uvicorn agentic_app.analysis_service.main:app --reload --port 8082

# Terminal 3 - Supervisor Service
uvicorn agentic_app.supervisor_service.main:app --reload --port 8080
```
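With all three services running, a short end-to-end check exercises the same REST flow the supervisor uses (a minimal sketch in the spirit of test_system.py, which is not reproduced on this page):

```python
import httpx

# Step 1: research (port 8081, POST /research)
research = httpx.post(
    "http://localhost:8081/research",
    json={"query": "impact of AI on healthcare", "max_retries": 3},
    timeout=300.0,
).json()
assert research["success"], research["error_messages"]

# Step 2: feed the summary to the analysis service (port 8082, POST /analyze)
analysis = httpx.post(
    "http://localhost:8082/analyze",
    json={"research_summary": research["summary"], "max_retries": 3},
    timeout=300.0,
).json()
print(analysis["final_analysis"])
```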

??? "Full Code (old)"

??? "Graph"
```mermaid
graph TD
    Start([Start]) --> ValidateQuery[Validate Query]
    ValidateQuery -->|Success| CreatePlan[Create Research Plan]
    ValidateQuery -->|Error & Retries| ValidateQuery
    ValidateQuery -->|Error & Max Retries| End([End])
    CreatePlan -->|Success| GatherInfo[Gather Information]
    CreatePlan -->|Error & Retries| CreatePlan
    CreatePlan -->|Error & Max Retries| End
    GatherInfo -->|Success| Synthesize[Synthesize Findings]
    GatherInfo -->|Error & Retries| GatherInfo
    GatherInfo -->|Error & Max Retries| End
    Synthesize -->|Success| GenerateReport[Generate Report]
    Synthesize -->|Error & Retries| Synthesize
    Synthesize -->|Error & Max Retries| End
    GenerateReport -->|Success| End
    GenerateReport -->|Error & Retries| GenerateReport
    GenerateReport -->|Error & Max Retries| End

    style ValidateQuery fill:#e1f5ff
    style CreatePlan fill:#e1f5ff
    style GatherInfo fill:#e1f5ff
    style Synthesize fill:#e1f5ff
    style GenerateReport fill:#e1f5ff
    style Start fill:#d4edda
    style End fill:#f8d7da
```
```python linenums="1"
import asyncio
import logging
from functools import wraps
from typing import Protocol, Optional, TypeVar, Any, Callable

from langgraph.graph import StateGraph, END
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langgraph.graph.state import CompiledStateGraph
from pydantic import BaseModel, Field

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ============================================================================
# ERROR HANDLER
# ============================================================================
class ErrorState(Protocol):
    error_messages: list[str]
    retry_count: int
    max_retries: int
    last_failed_node: Optional[str]
    current_node: Optional[str]


StateType = TypeVar('StateType', bound=ErrorState)


class ErrorHandler:
    @staticmethod
    def handle_error(state: StateType, error: Exception, node_name: str, custom_message: Optional[str] = None) -> dict:
        error_msg = custom_message or f"Error in {node_name}: {str(error)}"
        logger.error(f"Node '{node_name}' failed: {str(error)}")
        return {
            "error_messages": [error_msg],
            "retry_count": state.retry_count + 1,
            "last_failed_node": node_name,
            "current_node": node_name
        }

    @staticmethod
    def should_retry(state: StateType) -> bool:
        return state.retry_count < state.max_retries and len(state.error_messages) > 0

    @staticmethod
    def clear_errors(state: StateType) -> dict[str, Any]:
        return {
            "error_messages": [],
            "retry_count": 0,
            "last_failed_node": None,
        }

    @staticmethod
    def get_error_summary(state: StateType) -> dict[str, Any]:
        return {
            "has_errors": len(state.error_messages) > 0,
            "error_count": len(state.error_messages),
            "retry_count": state.retry_count,
            "last_failed_node": state.last_failed_node,
            "can_retry": ErrorHandler.should_retry(state)
        }


def handle_node_errors(node_name: str, custom_message: Optional[str] = None):
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        async def async_wrapper(self, state: StateType) -> dict[str, Any]:
            try:
                result = await func(self, state)
                if result is None:
                    result = {}
                result.update(ErrorHandler.clear_errors(state))
                return result
            except Exception as e:
                logger.exception(f"Error in async node '{node_name}'")
                return ErrorHandler.handle_error(state, e, node_name, custom_message)

        @wraps(func)
        def sync_wrapper(self, state: StateType) -> dict[str, Any]:
            try:
                result = func(self, state)
                if result is None:
                    result = {}
                result.update(ErrorHandler.clear_errors(state))
                return result
            except Exception as e:
                logger.exception(f"Error in sync node '{node_name}'")
                return ErrorHandler.handle_error(state, e, node_name, custom_message)

        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper

    return decorator


# ============================================================================
# STATE DEFINITION
# ============================================================================
class ResearchState(BaseModel):
    """State for the research assistant workflow"""
    query: str = ""
    research_plan: str = ""
    search_results: list[str] = Field(default_factory=list)
    summary: str = ""
    final_report: str = ""

    # Error handling fields
    error_messages: list[str] = Field(default_factory=list)
    retry_count: int = 0
    max_retries: int = 3
    last_failed_node: Optional[str] = None
    current_node: Optional[str] = None

    # Control flow
    should_continue: bool = True

    class Config:
        arbitrary_types_allowed = True


# ============================================================================
# RESEARCH NODES
# ============================================================================
class ResearchNodes:
    """Collection of nodes for the research workflow"""

    def __init__(self, llm: Optional[ChatOllama] = None):
        self.llm = llm or ChatOllama(model="gpt-oss", temperature=0)

    @handle_node_errors("validate_query", "Failed to validate the research query")
    def validate_query(self, state: ResearchState) -> dict[str, Any]:
        """Validate that the query is appropriate for research"""
        logger.info(f"Validating query: {state.query}")
        if not state.query or len(state.query.strip()) < 5:
            raise ValueError("Query must be at least 5 characters long")
        # Simulate potential validation issues
        if "error" in state.query.lower():
            raise ValueError("Query contains forbidden terms")
        return {
            "current_node": "validate_query",
            "should_continue": True
        }

    @handle_node_errors("create_research_plan", "Failed to create research plan")
    async def create_research_plan(self, state: ResearchState) -> dict[str, Any]:
        """Create a research plan based on the query"""
        logger.info(f"Creating research plan for: {state.query}")
        messages = [
            SystemMessage(content="You are a research planning assistant. Create a brief 3-step research plan."),
            HumanMessage(content=f"Create a research plan for: {state.query}")
        ]
        response = await self.llm.ainvoke(messages)
        if not response.content:
            raise ValueError("LLM returned empty research plan")
        return {
            "research_plan": response.content,
            "current_node": "create_research_plan",
            "should_continue": True
        }

    @handle_node_errors("gather_information", "Failed to gather information")
    async def gather_information(self, state: ResearchState) -> dict[str, Any]:
        """Simulate gathering information from various sources"""
        logger.info("Gathering information...")
        # Simulate API calls that might fail
        await asyncio.sleep(0.5)
        # Simulate random failures for demonstration
        import random
        if random.random() < 0.2:  # 20% chance of failure
            raise ConnectionError("Failed to connect to research database")
        # Simulate search results
        search_results = [
            f"Research finding 1 about {state.query}",
            f"Research finding 2 about {state.query}",
            f"Research finding 3 about {state.query}",
        ]
        return {
            "search_results": search_results,
            "current_node": "gather_information",
            "should_continue": True
        }

    @handle_node_errors("synthesize_findings", "Failed to synthesize findings")
    async def synthesize_findings(self, state: ResearchState) -> dict[str, Any]:
        """Synthesize the gathered information into a summary"""
        logger.info("Synthesizing findings...")
        if not state.search_results:
            raise ValueError("No search results available to synthesize")
        findings_text = "\n".join(f"- {result}" for result in state.search_results)
        messages = [
            SystemMessage(content="You are a research synthesis assistant. Summarize the findings concisely."),
            HumanMessage(
                content=f"Research Plan:\n{state.research_plan}\n\nFindings:\n{findings_text}\n\nProvide a brief summary.")
        ]
        response = await self.llm.ainvoke(messages)
        return {
            "summary": response.content,
            "current_node": "synthesize_findings",
            "should_continue": True
        }

    @handle_node_errors("generate_report", "Failed to generate final report")
    async def generate_report(self, state: ResearchState) -> dict[str, Any]:
        """Generate the final research report"""
        logger.info("Generating final report...")
        messages = [
            SystemMessage(content="You are a report writing assistant. Create a concise final report."),
            HumanMessage(content=f"Query: {state.query}\n\nSummary: {state.summary}\n\nCreate a final report.")
        ]
        response = await self.llm.ainvoke(messages)
        return {
            "final_report": response.content,
            "current_node": "generate_report",
            "should_continue": False
        }


# ============================================================================
# ROUTING LOGIC
# ============================================================================
def create_universal_router(next_node: str, end_node: str = END):
    """Create a universal router that handles errors and retries"""

    def router(state) -> str:
        # Handle both dict and Pydantic model
        if isinstance(state, dict):
            error_messages = state.get('error_messages', [])
            retry_count = state.get('retry_count', 0)
            max_retries = state.get('max_retries', 3)
            last_failed_node = state.get('last_failed_node', 'validate_query')
        else:
            error_messages = state.error_messages
            retry_count = state.retry_count
            max_retries = state.max_retries
            last_failed_node = state.last_failed_node or 'validate_query'
        if len(error_messages) > 0:
            if retry_count < max_retries:
                logger.info(f"Retrying {last_failed_node}, attempt {retry_count}/{max_retries}")
                return last_failed_node
            else:
                logger.error(f"Max retries reached for {last_failed_node}, ending execution")
                return end_node
        else:
            return next_node

    return router


def should_retry_node(state) -> str:
    """Route back to the failed node for retry"""
    if isinstance(state, dict):
        last_failed = state.get('last_failed_node', 'validate_query')
    else:
        last_failed = state.last_failed_node or 'validate_query'
    logger.info(f"Routing to retry node: {last_failed}")
    return last_failed


# ============================================================================
# GRAPH CONSTRUCTION
# ============================================================================
def create_research_graph(llm: Optional[ChatOllama] = None) -> CompiledStateGraph:
    """Create the research assistant graph with error handling"""
    nodes = ResearchNodes(llm)

    # Create the graph
    workflow = StateGraph(ResearchState)

    # Add nodes
    workflow.add_node("validate_query", nodes.validate_query)
    workflow.add_node("create_research_plan", nodes.create_research_plan)
    workflow.add_node("gather_information", nodes.gather_information)
    workflow.add_node("synthesize_findings", nodes.synthesize_findings)
    workflow.add_node("generate_report", nodes.generate_report)

    # Set entry point
    workflow.set_entry_point("validate_query")

    # Add conditional edges using the universal router.
    # The router automatically retries the failed node or moves to the next node.
    workflow.add_conditional_edges(
        "validate_query",
        create_universal_router(next_node="create_research_plan")
    )
    workflow.add_conditional_edges(
        "create_research_plan",
        create_universal_router(next_node="gather_information")
    )
    workflow.add_conditional_edges(
        "gather_information",
        create_universal_router(next_node="synthesize_findings")
    )
    workflow.add_conditional_edges(
        "synthesize_findings",
        create_universal_router(next_node="generate_report")
    )
    workflow.add_conditional_edges(
        "generate_report",
        create_universal_router(next_node=END)
    )

    return workflow.compile()


# ============================================================================
# MAIN EXECUTION
# ============================================================================
async def main():
    """Run the research assistant"""
    print("=" * 80)
    print("RESEARCH ASSISTANT WITH ERROR HANDLING")
    print("=" * 80)

    # Create the graph
    graph = create_research_graph()

    # Test queries
    queries = [
        "What are the latest developments in quantum computing?",
        "err",  # This will fail validation (too short)
        "Impact of artificial intelligence on healthcare",
    ]

    for i, query in enumerate(queries, 1):
        print(f"\n{'=' * 80}")
        print(f"QUERY {i}: {query}")
        print(f"{'=' * 80}\n")
        initial_state = ResearchState(query=query)
        try:
            final_state = await graph.ainvoke(initial_state)

            # Display results
            print("\n" + "=" * 80)
            print("RESULTS")
            print("=" * 80)

            # final_state is a dict, not a ResearchState object
            error_messages = final_state.get("error_messages", [])
            if error_messages:
                print("\n❌ FAILED with errors:")
                for error in error_messages:
                    print(f" - {error}")
                print(f"\nRetry count: {final_state.get('retry_count', 0)}/{final_state.get('max_retries', 3)}")
            else:
                print("\n✅ SUCCESS!")
                research_plan = final_state.get('research_plan', 'N/A')
                summary = final_state.get('summary', 'N/A')
                final_report = final_state.get('final_report', 'N/A')
                print(f"\nResearch Plan:\n{research_plan[:200] if research_plan != 'N/A' else research_plan}...")
                print(f"\nSummary:\n{summary[:200] if summary != 'N/A' else summary}...")
                print(f"\nFinal Report:\n{final_report[:300] if final_report != 'N/A' else final_report}...")

            # Show error summary
            print("\nError Summary:")
            print(f" - Has errors: {len(error_messages) > 0}")
            print(f" - Error count: {len(error_messages)}")
            print(f" - Retry count: {final_state.get('retry_count', 0)}")
            print(f" - Last failed node: {final_state.get('last_failed_node', 'None')}")
        except Exception as e:
            print(f"\n❌ Unexpected error: {e}")
        await asyncio.sleep(1)


if __name__ == "__main__":
    asyncio.run(main())
```