aamanlamba committed on
Commit
60ac2eb
·
1 Parent(s): 5bb0a78

first version - lineage extractor

.env.example ADDED
@@ -0,0 +1,17 @@
+ # Example environment variables for Lineage Graph Extractor
+ # Copy this file to .env and fill in your actual values
+
+ # Anthropic API Key (for the Claude AI agent)
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+ # Google Cloud (for BigQuery integration)
+ GOOGLE_CLOUD_PROJECT=your-gcp-project-id
+ GOOGLE_APPLICATION_CREDENTIALS=path/to/service-account-key.json
+
+ # Optional: Tavily API key (for web search; see LOCAL_SETUP.md)
+ # TAVILY_API_KEY=your_tavily_api_key
+
+ # Optional: Custom API endpoints
+ # METADATA_API_URL=https://your-metadata-api.com
+
+ # Optional: MCP Server Configuration
+ # MCP_SERVER_URL=https://your-mcp-server.com
+ # MCP_API_KEY=your_mcp_api_key
+
.gitignore ADDED
@@ -0,0 +1,49 @@
+ # Environment variables
+ .env
+ .venv
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Credentials (keep agent config JSON such as tools.json tracked)
+ *.json
+ !**/tools.json
+ service-account-*.json
+ credentials.json
+
+ # Logs
+ *.log
+
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
DEPLOYMENT.md ADDED
@@ -0,0 +1,226 @@
+ # Quick Deployment Guide
+
+ Follow these steps to deploy the Lineage Graph Extractor to Hugging Face Spaces.
+
+ ## Quick Start (5 minutes)
+
+ ### 1. Create Space
+ ```bash
+ # Go to: https://huggingface.co/new-space
+ # Choose: Gradio SDK
+ # Hardware: CPU Basic (free)
+ ```
+
+ ### 2. Upload Files
+ Upload these files from `/hf_space/` to your Space:
+ - ✅ `app.py`
+ - ✅ `requirements.txt`
+ - ✅ `README.md`
+ - ⚠️ `.env.example` (optional reference)
+ - ⚠️ `SETUP_GUIDE.md` (optional)
+
+ ### 3. Add Secrets
+ In Space Settings → Repository secrets, add:
+ - `ANTHROPIC_API_KEY` - Your Claude API key (**required**)
+ - `GOOGLE_CLOUD_PROJECT` - For BigQuery (optional)
+
+ ### 4. Wait for Build
+ - The Space builds automatically (2-3 minutes)
+ - Check the "Logs" tab for any errors
+ - Once the build finishes, the app is live!
+
+ ## Detailed Step-by-Step
+
+ ### Method 1: Web Interface (Easiest)
+
+ 1. **Create Space**
+    - Go to https://huggingface.co/spaces
+    - Click "Create new Space"
+    - Name: `lineage-graph-extractor`
+    - SDK: Gradio
+    - Click "Create Space"
+
+ 2. **Upload Files**
+    - Click "Files and versions"
+    - Click "Add file" → "Upload files"
+    - Select all files from `/hf_space/`
+    - Click "Commit changes"
+
+ 3. **Configure Secrets**
+    - Click "Settings"
+    - Scroll to "Repository secrets"
+    - Add `ANTHROPIC_API_KEY` with your API key
+    - Save
+
+ 4. **Verify Deployment**
+    - Go to the "App" tab
+    - Wait for the build to complete
+    - Test the interface
+
+ ### Method 2: Git CLI (For Developers)
+
+ ```bash
+ # Clone your Space
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/lineage-graph-extractor
+ cd lineage-graph-extractor
+
+ # Copy files (adjust the path to where you saved the files)
+ cp /path/to/hf_space/app.py .
+ cp /path/to/hf_space/requirements.txt .
+ cp /path/to/hf_space/README.md .
+
+ # Commit and push
+ git add .
+ git commit -m "Initial deployment"
+ git push
+ ```
+
+ Then add secrets via the web interface (Settings → Repository secrets).
+
+ ### Method 3: Hugging Face CLI
+
+ ```bash
+ # Install the Hugging Face CLI
+ pip install huggingface_hub
+
+ # Login
+ huggingface-cli login
+
+ # Create the Space
+ huggingface-cli repo create lineage-graph-extractor --type space --space_sdk gradio
+
+ # Upload files (note the space repo type)
+ huggingface-cli upload YOUR_USERNAME/lineage-graph-extractor /path/to/hf_space/ . --repo-type space
+ ```
+
+ ## Important: Connect Your Agent
+
+ ⚠️ **The template needs your agent integration!**
+
+ The `app.py` file contains placeholder functions. You need to integrate your actual agent:
+
+ ### Quick Integration Example
+
+ Edit `app.py` and replace the `extract_lineage_from_text` function:
+
+ ```python
+ import anthropic
+ import os
+
+ client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
+
+ def extract_lineage_from_text(metadata_text, source_type, viz_format):
+     """Extract lineage using the Claude AI agent."""
+
+     prompt = f"""
+     You are a lineage extraction expert. Extract data lineage from this {source_type} metadata
+     and create a {viz_format} visualization.
+
+     Metadata:
+     {metadata_text}
+
+     Return:
+     1. The visualization code
+     2. A brief summary
+     """
+
+     response = client.messages.create(
+         model="claude-3-5-sonnet-20241022",
+         max_tokens=4000,
+         messages=[{"role": "user", "content": prompt}]
+     )
+
+     # Parse the response to extract the visualization and summary
+     text = response.content[0].text
+
+     # Simple parsing (improve this based on your needs)
+     parts = text.split("---")
+     visualization = parts[0] if len(parts) > 0 else text
+     summary = parts[1] if len(parts) > 1 else "Lineage extracted successfully"
+
+     return visualization.strip(), summary.strip()
+ ```
+
+ ### Using Agent Memory Files
+
+ To use your full agent configuration:
+
+ 1. Copy the `/memories/` directory to the Space:
+ ```bash
+ cp -r /memories /path/to/space/
+ ```
+
+ 2. Reference the agent instructions in your code:
+ ```python
+ with open("memories/agent.md") as f:
+     agent_instructions = f.read()
+
+ # Use the instructions in prompts
+ ```
+
+ ## Post-Deployment
+
+ ### Test Functionality
+ 1. ✅ Text/File extraction works
+ 2. ✅ BigQuery integration (if configured)
+ 3. ✅ URL fetching works
+ 4. ✅ Visualizations render correctly
+
+ ### Optimize Performance
+ - Upgrade hardware if needed (Settings → Hardware)
+ - Add caching for repeated queries (see the sketch below)
+ - Implement rate limiting
+
174
+ ### Share Your Space
175
+ - Make it public (Settings β†’ Visibility)
176
+ - Share URL: `https://huggingface.co/spaces/YOUR_USERNAME/lineage-graph-extractor`
177
+ - Add to your profile or collection
178
+
179
+ ## Costs
180
+
181
+ - **Basic CPU**: Free forever βœ…
182
+ - **Upgraded CPU**: ~$0.03/hour
183
+ - **GPU**: ~$0.60/hour (if needed for heavy processing)
184
+ - **API costs**: Anthropic Claude API usage (pay-as-you-go)
185
+
186
+ ## Troubleshooting
187
+
188
+ ### Build Fails
189
+ - Check requirements.txt for incompatible versions
190
+ - Review logs for specific error messages
191
+ - Ensure Python 3.9+ compatibility
192
+
193
+ ### App Won't Load
194
+ - Verify `app.py` has no syntax errors
195
+ - Check that `demo.launch()` is called
196
+ - Review Space logs
197
+
198
+ ### API Errors
199
+ - Verify `ANTHROPIC_API_KEY` is set correctly
200
+ - Check API key has proper permissions
201
+ - Monitor API usage and rate limits
202
+
203
+ ### Visualization Issues
204
+ - Test Mermaid syntax at https://mermaid.live/
205
+ - Ensure proper code block formatting
206
+ - Check browser console for rendering errors
207
+
208
+ ## Support
209
+
210
+ - **Hugging Face Docs**: https://huggingface.co/docs/hub/spaces
211
+ - **Gradio Docs**: https://gradio.app/docs
212
+ - **Community Forum**: https://discuss.huggingface.co/
213
+
214
+ ## Next Steps
215
+
216
+ 1. βœ… Deploy to Hugging Face Spaces
217
+ 2. πŸ”§ Integrate your agent backend
218
+ 3. πŸ§ͺ Test with real metadata
219
+ 4. 🎨 Customize UI/UX
220
+ 5. πŸ“Š Add analytics
221
+ 6. πŸš€ Share with community
222
+
223
+ ---
224
+
225
+ **Ready to deploy?** Start with Method 1 (Web Interface) - it's the easiest!
226
+
LOCAL_SETUP.md ADDED
@@ -0,0 +1,474 @@
+ # Local Setup Guide - Lineage Graph Extractor
+
+ This guide provides detailed instructions for setting up and running the Lineage Graph Extractor agent locally.
+
+ ## Table of Contents
+ 1. [System Requirements](#system-requirements)
+ 2. [Installation Methods](#installation-methods)
+ 3. [Configuration](#configuration)
+ 4. [Usage Scenarios](#usage-scenarios)
+ 5. [Advanced Configuration](#advanced-configuration)
+ 6. [Troubleshooting](#troubleshooting)
+
+ ## System Requirements
+
+ ### Minimum Requirements
+ - **OS**: Windows 10+, macOS 10.15+, or Linux
+ - **Python**: 3.9 or higher
+ - **Memory**: 2GB RAM minimum
+ - **Disk Space**: 100MB for agent files
+
+ ### Recommended Requirements
+ - **Python**: 3.10+
+ - **Memory**: 4GB RAM
+ - **Internet**: Stable connection for API calls
+
+ ## Installation Methods
+
+ ### Method 1: Standalone Use (Recommended)
+
+ This method uses the agent configuration files with any platform that supports the Anthropic API.
+
+ 1. **Download the agent**
+ ```bash
+ # If you have a git repository
+ git clone <repository-url>
+ cd local_clone
+
+ # Or extract from a downloaded archive
+ unzip lineage-graph-extractor.zip
+ cd lineage-graph-extractor
+ ```
+
+ 2. **Set up the environment**
+ ```bash
+ # Copy the environment template
+ cp .env.example .env
+ ```
+
+ 3. **Edit the .env file**
+ ```bash
+ # Edit with your preferred editor
+ nano .env
+ # or
+ vim .env
+ # or
+ code .env  # VS Code
+ ```
+
+ Add your credentials:
+ ```
+ ANTHROPIC_API_KEY=sk-ant-your-key-here
+ GOOGLE_CLOUD_PROJECT=your-gcp-project
+ GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
+ ```
+
+ 4. **Install Python dependencies** (optional, for the examples)
+ ```bash
+ pip install anthropic google-cloud-bigquery requests pyyaml
+ ```
+
+ ### Method 2: Claude Desktop Integration
+
+ If you're using Claude Desktop or a similar platform:
+
+ 1. **Locate your agent configuration directory**
+    - Claude Desktop: `~/.config/claude/agents/` (Linux/Mac) or `%APPDATA%\claude\agents\` (Windows)
+    - Other platforms: Check the platform documentation
+
+ 2. **Copy the memories folder**
+ ```bash
+ # Linux/Mac
+ cp -r memories ~/.config/claude/agents/lineage-extractor/
+
+ # Windows
+ xcopy /E /I memories %APPDATA%\claude\agents\lineage-extractor\
+ ```
+
+ 3. **Configure API credentials** in your platform's settings
+
+ 4. **Restart the application**
+
+ ### Method 3: Python Integration
+
+ To integrate into your own Python application:
+
+ 1. **Install dependencies**
+ ```bash
+ pip install anthropic python-dotenv
+ ```
+
+ 2. **Use the integration example**
+ ```python
+ from anthropic import Anthropic
+ from dotenv import load_dotenv
+ import os
+
+ # Load environment variables
+ load_dotenv()
+
+ # Initialize the client
+ client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+ # Load the agent configuration
+ with open("memories/agent.md", "r") as f:
+     system_prompt = f.read()
+
+ # Use the agent
+ response = client.messages.create(
+     model="claude-3-5-sonnet-20241022",
+     max_tokens=4000,
+     system=system_prompt,
+     messages=[{
+         "role": "user",
+         "content": "Extract lineage from this metadata: ..."
+     }]
+ )
+
+ print(response.content[0].text)
+ ```
+
+ ## Configuration
+
+ ### API Keys Setup
+
+ #### Anthropic API Key
+ 1. Go to https://console.anthropic.com/
+ 2. Create an account or sign in
+ 3. Navigate to API Keys
+ 4. Create a new key
+ 5. Copy it to the `.env` file
+
+ #### Google Cloud (for BigQuery)
+ 1. Go to https://console.cloud.google.com/
+ 2. Create a project or select an existing one
+ 3. Enable the BigQuery API
+ 4. Create a service account:
+    - Go to IAM & Admin → Service Accounts
+    - Create the service account
+    - Grant the "BigQuery Data Viewer" role
+    - Create a JSON key
+ 5. Download the JSON key and reference it in `.env` (a quick connectivity check follows)
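+
+ To confirm the service account works before wiring it into the agent, here is a short check with the `google-cloud-bigquery` client (replace `your-gcp-project-id`; credentials are picked up from `GOOGLE_APPLICATION_CREDENTIALS`):
+
+ ```python
+ from google.cloud import bigquery
+
+ # Uses GOOGLE_APPLICATION_CREDENTIALS from the environment.
+ client = bigquery.Client(project="your-gcp-project-id")
+ for dataset in client.list_datasets(max_results=5):
+     print(dataset.dataset_id)  # lists a few datasets if access is granted
+ ```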
+
+ #### Tavily (for web search)
+ 1. Go to https://tavily.com/
+ 2. Sign up for an account
+ 3. Get your API key
+ 4. Add it to the `.env` file
+
+ ### Tool Configuration
+
+ Edit `memories/tools.json` to customize the available tools (JSON does not allow comments; each tool is described below):
+
+ ```json
+ {
+   "tools": [
+     "bigquery_execute_query",
+     "read_url_content",
+     "google_sheets_read_range",
+     "tavily_web_search"
+   ],
+   "interrupt_config": {
+     "bigquery_execute_query": false,
+     "read_url_content": false,
+     "google_sheets_read_range": false,
+     "tavily_web_search": false
+   }
+ }
+ ```
+
+ **Available Tools:**
+ - `bigquery_execute_query`: Execute SQL queries on BigQuery
+ - `read_url_content`: Fetch content from URLs/APIs
+ - `google_sheets_read_range`: Read data from Google Sheets
+ - `tavily_web_search`: Perform web searches
+
+ ### Subagent Configuration
+
+ Customize subagents by editing their configuration files:
+
+ **Metadata Parser** (`memories/subagents/metadata_parser/`)
+ - `agent.md`: Instructions for parsing metadata
+ - `tools.json`: Tools available to the parser
+
+ **Graph Visualizer** (`memories/subagents/graph_visualizer/`)
+ - `agent.md`: Instructions for creating visualizations
+ - `tools.json`: Tools available to the visualizer
+
+ ## Usage Scenarios
+
+ ### Scenario 1: BigQuery Lineage Extraction
+
+ ```python
+ from anthropic import Anthropic
+ import os
+
+ client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+ with open("memories/agent.md", "r") as f:
+     system_prompt = f.read()
+
+ response = client.messages.create(
+     model="claude-3-5-sonnet-20241022",
+     max_tokens=4000,
+     system=system_prompt,
+     messages=[{
+         "role": "user",
+         "content": "Extract lineage from BigQuery project: my-project, dataset: analytics"
+     }]
+ )
+
+ print(response.content[0].text)
+ ```
+
+ ### Scenario 2: File-Based Metadata
+
+ ```python
+ # Read metadata from a file
+ with open("dbt_manifest.json", "r") as f:
+     metadata = f.read()
+
+ response = client.messages.create(
+     model="claude-3-5-sonnet-20241022",
+     max_tokens=4000,
+     system=system_prompt,
+     messages=[{
+         "role": "user",
+         "content": f"Extract lineage from this dbt manifest:\n\n{metadata}"
+     }]
+ )
+ ```
+
+ ### Scenario 3: API Metadata
+
+ ```python
+ response = client.messages.create(
+     model="claude-3-5-sonnet-20241022",
+     max_tokens=4000,
+     system=system_prompt,
+     messages=[{
+         "role": "user",
+         "content": "Extract lineage from API: https://api.example.com/metadata"
+     }]
+ )
+ ```
+
+ ## Advanced Configuration
+
+ ### Custom Visualization Formats
+
+ To add custom visualization formats, edit `memories/subagents/graph_visualizer/agent.md`:
+
+ ```markdown
+ ### 4. Custom Format
+ Generate a custom format with:
+ - Your specific requirements
+ - Custom styling rules
+ - Special formatting needs
+ ```
+
+ ### Adding New Metadata Sources
+
+ To support new metadata sources:
+
+ 1. Add the tool to `memories/tools.json` (see the sketch after this list)
+ 2. Update `memories/agent.md` with source-specific instructions
+ 3. Update `memories/subagents/metadata_parser/agent.md` if needed
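+
+ A sketch of registering a tool programmatically, following the `tools.json` structure shown above (`snowflake_get_lineage` is a hypothetical tool id used for illustration):
+
+ ```python
+ import json
+
+ with open("memories/tools.json") as f:
+     config = json.load(f)
+
+ # Register the new tool and let it run without an approval interrupt.
+ config["tools"].append("snowflake_get_lineage")
+ config["interrupt_config"]["snowflake_get_lineage"] = False
+
+ with open("memories/tools.json", "w") as f:
+     json.dump(config, f, indent=2)
+ ```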
+
+ ### MCP Integration
+
+ To integrate with Model Context Protocol servers:
+
+ 1. Check if MCP tools are available in the `/tools` directory
+ 2. Add MCP tools to `memories/tools.json`
+ 3. Configure the MCP server connection
+ 4. See `memories/mcp_integration.md` (if available)
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ #### 1. Authentication Errors
+
+ **Problem**: API authentication fails
+ **Solutions**:
+ - Verify the API key is correct in `.env`
+ - Check that the key hasn't expired
+ - Ensure environment variables are loaded
+ - Try regenerating the API key
+
+ ```bash
+ # Test the Anthropic API key
+ python -c "from anthropic import Anthropic; import os; from dotenv import load_dotenv; load_dotenv(); client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY')); print('✓ API key works')"
+ ```
+
+ #### 2. BigQuery Access Issues
+
+ **Problem**: Cannot access BigQuery
+ **Solutions**:
+ - Verify the service account has BigQuery permissions
+ - Check that the project ID is correct
+ - Ensure the JSON key file path is correct
+ - Test the credentials:
+
+ ```bash
+ # Test BigQuery access
+ gcloud auth activate-service-account --key-file=/path/to/key.json
+ bq ls --project_id=your-project-id
+ ```
+
+ #### 3. Import Errors
+
+ **Problem**: `ModuleNotFoundError`
+ **Solutions**:
+ ```bash
+ # Install missing packages
+ pip install anthropic google-cloud-bigquery requests pyyaml python-dotenv
+
+ # Or install all at once
+ pip install -r requirements.txt  # if you create one
+ ```
+
+ #### 4. Environment Variables Not Loading
+
+ **Problem**: `.env` file not being read
+ **Solutions**:
+ ```python
+ # Explicitly load .env
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Or specify the path
+ load_dotenv(".env")
+
+ # Verify loading
+ import os
+ print(os.getenv("ANTHROPIC_API_KEY"))  # Should not be None
+ ```
+
+ #### 5. File Path Issues
+
+ **Problem**: Cannot find `memories/agent.md`
+ **Solutions**:
+ ```python
+ # Use an absolute path
+ import os
+ base_dir = os.path.dirname(os.path.abspath(__file__))
+ agent_path = os.path.join(base_dir, "memories", "agent.md")
+
+ # Or change the working directory
+ os.chdir("/path/to/local_clone")
+ ```
+
+ ### Performance Issues
+
+ #### Slow Response Times
+
+ **Causes**:
+ - Large metadata files
+ - Complex lineage graphs
+ - Network latency
+
+ **Solutions**:
+ - Break large metadata into chunks (see the sketch after this list)
+ - Use filtering to focus on specific entities
+ - Increase API timeout settings
+ - Cache frequently used results
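+
+ A rough chunking sketch for oversized metadata (a hypothetical helper, not part of the repo; pair it with the merge step described in `memories/agent.md`):
+
+ ```python
+ def chunk_text(text: str, max_chars: int = 20_000):
+     """Yield slices of text small enough for a single agent call."""
+     for start in range(0, len(text), max_chars):
+         yield text[start:start + max_chars]
+
+ # Each chunk goes through extract_lineage() separately; merge the
+ # resulting nodes and edges before calling the graph visualizer.
+ ```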
+
+ ### Debugging Tips
+
+ 1. **Enable verbose logging**
+ ```python
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+ ```
+
+ 2. **Test each component separately**
+    - Test the API connection first
+    - Test metadata retrieval
+    - Test parsing separately
+    - Test visualization separately
+
+ 3. **Validate the metadata format** (see the snippet after this list)
+    - Ensure the JSON is valid
+    - Check for required fields
+    - Verify the structure matches the expected format
+
+ 4. **Check the agent configuration**
+    - Verify `memories/agent.md` is readable
+    - Check the `tools.json` syntax
+    - Ensure the subagent files exist
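+
+ A quick way to validate a JSON metadata file before handing it to the agent (`metadata.json` is a placeholder path):
+
+ ```python
+ import json
+
+ with open("metadata.json") as f:
+     try:
+         json.load(f)
+         print("✓ Valid JSON")
+     except json.JSONDecodeError as e:
+         print(f"✗ Invalid JSON at line {e.lineno}: {e.msg}")
+ ```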
+
+ ## Getting Help
+
+ ### Documentation
+ - Agent instructions: `memories/agent.md`
+ - Subagent docs: `memories/subagents/*/agent.md`
+ - Anthropic API: https://docs.anthropic.com/
+
+ ### Testing Your Setup
+
+ Run this complete test:
+
+ ```python
+ from anthropic import Anthropic
+ from dotenv import load_dotenv
+ import os
+
+ # Load the environment
+ load_dotenv()
+
+ # Test 1: API Connection
+ try:
+     client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+     print("✓ Anthropic API connection successful")
+ except Exception as e:
+     print(f"✗ API connection failed: {e}")
+     exit(1)
+
+ # Test 2: Load Agent Config
+ try:
+     with open("memories/agent.md", "r") as f:
+         system_prompt = f.read()
+     print("✓ Agent configuration loaded")
+ except Exception as e:
+     print(f"✗ Failed to load agent config: {e}")
+     exit(1)
+
+ # Test 3: Simple Query
+ try:
+     response = client.messages.create(
+         model="claude-3-5-sonnet-20241022",
+         max_tokens=1000,
+         system=system_prompt,
+         messages=[{
+             "role": "user",
+             "content": "Hello, what can you help me with?"
+         }]
+     )
+     print("✓ Agent response successful")
+     print(f"\nAgent says: {response.content[0].text}")
+ except Exception as e:
+     print(f"✗ Agent query failed: {e}")
+     exit(1)
+
+ print("\n✓ All tests passed! Your setup is ready.")
+ ```
+
+ Save it as `test_setup.py` and run:
+ ```bash
+ python test_setup.py
+ ```
+
+ ## Next Steps
+
+ 1. ✅ Complete the setup
+ 2. ✅ Test with sample metadata
+ 3. 📊 Extract your first lineage
+ 4. 🎨 Customize visualization preferences
+ 5. 🔧 Integrate with your workflow
+
+ ---
+
+ **Setup complete?** Try the usage examples in README.md or run your own lineage extraction!
+
SETUP_GUIDE.md ADDED
@@ -0,0 +1,225 @@
+ # Setup Guide for Lineage Graph Extractor Space
+
+ This guide will help you deploy the Lineage Graph Extractor as a Hugging Face Space.
+
+ ## Prerequisites
+
+ 1. A Hugging Face account (create one at https://huggingface.co/join)
+ 2. API credentials for the services you want to integrate:
+    - Anthropic API key (for Claude AI)
+    - Google Cloud credentials (for BigQuery, optional)
+    - Other service credentials as needed
+
+ ## Step 1: Create a New Space
+
+ 1. Go to https://huggingface.co/spaces
+ 2. Click "Create new Space"
+ 3. Fill in the details:
+    - **Name**: `lineage-graph-extractor` (or your preferred name)
+    - **License**: MIT (or your choice)
+    - **SDK**: Gradio
+    - **Hardware**: CPU Basic (free tier) or upgrade for better performance
+    - **Visibility**: Public or Private (your choice)
+
+ ## Step 2: Upload Files
+
+ You need to upload these files to your Space repository:
+
+ ### Required Files
+ - `app.py` - Main application file
+ - `requirements.txt` - Python dependencies
+ - `README.md` - Space description and documentation
+
+ ### Optional Files
+ - `.env.example` - Example environment variables
+ - `SETUP_GUIDE.md` - This setup guide
+
+ ### Upload Methods
+
+ **Option A: Web Interface**
+ 1. Click "Files and versions" in your Space
+ 2. Click "Add file" → "Upload files"
+ 3. Upload all the files from the `/hf_space/` directory
+
+ **Option B: Git**
+ ```bash
+ # Clone your Space repository
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/lineage-graph-extractor
+ cd lineage-graph-extractor
+
+ # Copy files
+ cp /path/to/hf_space/* .
+
+ # Commit and push
+ git add .
+ git commit -m "Initial commit: Lineage Graph Extractor"
+ git push
+ ```
+
+ ## Step 3: Configure Secrets
+
+ For security, store sensitive credentials as Space secrets:
+
+ 1. Go to your Space settings
+ 2. Click "Repository secrets"
+ 3. Add the following secrets:
+
+ ### Required Secrets
+ - `ANTHROPIC_API_KEY`: Your Claude API key from https://console.anthropic.com/
+
+ ### Optional Secrets (based on the features you need)
+ - `GOOGLE_CLOUD_PROJECT`: Your GCP project ID
+ - `GOOGLE_APPLICATION_CREDENTIALS_JSON`: Service account JSON (as a string)
+ - `MCP_SERVER_URL`: MCP server endpoint (if using MCP)
+ - `MCP_API_KEY`: MCP authentication key
+
+ ### Accessing Secrets in Code
+
+ Update `app.py` to read from environment variables:
+
+ ```python
+ import os
+
+ ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
+ GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+ ```
+
+ ## Step 4: Integrate the Agent Backend
+
+ The current `app.py` is a template. You need to connect it to your actual agent:
+
+ ### Option A: Use the Anthropic SDK
+
+ ```python
+ import anthropic
+
+ client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
+
+ def extract_lineage_from_text(metadata_text, source_type, viz_format):
+     # Call your agent with the metadata_parser and graph_visualizer workers
+     response = client.messages.create(
+         model="claude-3-5-sonnet-20241022",
+         max_tokens=4000,
+         messages=[{
+             "role": "user",
+             "content": f"Extract lineage from this {source_type} metadata and visualize as {viz_format}: {metadata_text}"
+         }]
+     )
+     return response.content[0].text, "Processed successfully"
+ ```
+
+ ### Option B: Use an Agent API Endpoint
+
+ If you have your agent deployed as an API:
+
+ ```python
+ import requests
+
+ def extract_lineage_from_text(metadata_text, source_type, viz_format):
+     response = requests.post(
+         "https://your-agent-api.com/extract",
+         json={
+             "metadata": metadata_text,
+             "source_type": source_type,
+             "format": viz_format
+         }
+     )
+     data = response.json()  # parse the body once and reuse it
+     return data["visualization"], data["summary"]
+ ```
+
+ ### Option C: Bundle Agent Files
+
+ Include your agent configuration directly in the Space:
+
+ 1. Copy the `/memories/` directory to the Space
+ 2. Copy `/subagents/` if needed
+ 3. Import and use the agent logic in `app.py` (a loading sketch follows)
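+
+ A minimal sketch of step 3, assuming the `/memories/` directory was copied into the Space root (the same pattern as the integration example in `LOCAL_SETUP.md`):
+
+ ```python
+ import os
+ import anthropic
+
+ client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
+
+ # Load the bundled agent instructions as the system prompt.
+ with open("memories/agent.md") as f:
+     SYSTEM_PROMPT = f.read()
+
+ def run_agent(user_message: str) -> str:
+     response = client.messages.create(
+         model="claude-3-5-sonnet-20241022",
+         max_tokens=4000,
+         system=SYSTEM_PROMPT,
+         messages=[{"role": "user", "content": user_message}]
+     )
+     return response.content[0].text
+ ```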
+
+ ## Step 5: Test Your Space
+
+ 1. Once deployed, Hugging Face will automatically build and run your Space
+ 2. Check the "Logs" tab for any errors
+ 3. Test each feature:
+    - Text/File metadata extraction
+    - BigQuery integration (if configured)
+    - URL/API fetching
+
+ ## Step 6: Customize and Enhance
+
+ ### Add Authentication
+
+ For production use, add authentication:
+
+ ```python
+ demo.launch(auth=("username", "password"))
+ ```
+
+ Or integrate with Hugging Face OAuth (Gradio's `launch()` has no `auth_required` flag): enable `hf_oauth: true` in the Space's `README.md` metadata and add a login button to the interface:
+
+ ```python
+ with gr.Blocks() as demo:
+     gr.LoginButton()  # users sign in with their Hugging Face account
+     ...
+ ```
+
+ ### Improve Error Handling
+
+ Add try/except blocks and user-friendly error messages:
+
+ ```python
+ try:
+     result = extract_lineage_from_text(metadata_text, source_type, viz_format)
+     return result
+ except Exception as e:
+     return "", f"Error: {str(e)}"
+ ```
+
+ ### Add More Features
+
+ - File upload support
+ - Export visualizations as images
+ - History/session management
+ - Batch processing
+
+ ## Troubleshooting
+
+ ### Space won't start
+ - Check the logs for error messages
+ - Verify all dependencies in `requirements.txt`
+ - Ensure Python version compatibility
+
+ ### API errors
+ - Verify secrets are correctly set
+ - Check API key validity and permissions
+ - Review rate limits
+
+ ### Slow performance
+ - Upgrade to better hardware (CPU or GPU)
+ - Optimize the metadata parsing logic
+ - Add caching for repeated queries
+
+ ## Security Best Practices
+
+ 1. **Never commit API keys** to the repository
+ 2. **Use Space secrets** for all credentials
+ 3. **Validate user input** to prevent injection attacks (see the sketch after this list)
+ 4. **Use read-only credentials** when possible
+ 5. **Add rate limiting** to prevent abuse
+ 6. **Enable authentication** for production use
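+
+ A minimal input-validation sketch for item 3, to run at the top of each Gradio handler (the limits are illustrative):
+
+ ```python
+ MAX_METADATA_CHARS = 200_000  # reject pathologically large pastes
+
+ def validate_metadata(text: str) -> str:
+     if not text or not text.strip():
+         raise ValueError("Metadata input is empty.")
+     if len(text) > MAX_METADATA_CHARS:
+         raise ValueError("Metadata too large; split it into smaller chunks.")
+     return text.strip()
+ ```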
+
+ ## Getting Help
+
+ - Hugging Face Spaces docs: https://huggingface.co/docs/hub/spaces
+ - Gradio documentation: https://gradio.app/docs
+ - Anthropic API docs: https://docs.anthropic.com/
+
+ ## Next Steps
+
+ 1. Test the Space thoroughly
+ 2. Share it with your team or community
+ 3. Collect feedback and iterate
+ 4. Consider upgrading hardware for production workloads
+ 5. Add analytics to track usage
+
+ ---
+
+ **Need help?** Check the Hugging Face community forums or reach out to support.
+
app.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Lineage Graph Extractor - Hugging Face Space
+ A Gradio-based web interface for extracting and visualizing data lineage from various sources.
+ """
+
+ import gradio as gr
+ import json
+ import os
+ from typing import Optional, Tuple
+
+ # Note: This is a template. You'll need to integrate with your actual agent backend.
+ # This could be through an API, the Claude SDK, or another agent framework.
+
+ def extract_lineage_from_text(
+     metadata_text: str,
+     source_type: str,
+     visualization_format: str
+ ) -> Tuple[str, str]:
+     """
+     Extract lineage from provided metadata text.
+
+     Args:
+         metadata_text: Raw metadata content
+         source_type: Type of metadata source (BigQuery, dbt, Airflow, etc.)
+         visualization_format: Desired output format (Mermaid, DOT, Text)
+
+     Returns:
+         Tuple of (visualization_code, summary_text)
+     """
+     # TODO: Integrate with your agent backend
+     # This is where you'd call your agent with the metadata_parser and graph_visualizer workers
+
+     return (
+         "graph TD\n    A[Sample Node] --> B[Output Node]",
+         f"Processed {source_type} metadata. Found X nodes and Y relationships."
+     )
+
+ def extract_lineage_from_bigquery(
+     project_id: str,
+     query: str,
+     api_key: str,
+     visualization_format: str
+ ) -> Tuple[str, str]:
+     """
+     Extract lineage from BigQuery.
+
+     Args:
+         project_id: Google Cloud project ID
+         query: SQL query to extract metadata
+         api_key: API credentials
+         visualization_format: Desired output format
+
+     Returns:
+         Tuple of (visualization_code, summary_text)
+     """
+     # TODO: Integrate with BigQuery and your agent backend
+
+     return (
+         "graph TD\n    A[BigQuery Table] --> B[Destination Table]",
+         f"Extracted lineage from BigQuery project: {project_id}"
+     )
+
+ def extract_lineage_from_url(
+     url: str,
+     visualization_format: str
+ ) -> Tuple[str, str]:
+     """
+     Extract lineage from a URL/API endpoint.
+
+     Args:
+         url: URL to fetch metadata from
+         visualization_format: Desired output format
+
+     Returns:
+         Tuple of (visualization_code, summary_text)
+     """
+     # TODO: Integrate with URL fetching and your agent backend
+
+     return (
+         "graph TD\n    A[API Source] --> B[Data Pipeline]",
+         f"Extracted lineage from URL: {url}"
+     )
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Lineage Graph Extractor", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🔍 Lineage Graph Extractor
+
+     Extract and visualize data lineage from various metadata sources including BigQuery, dbt, Airflow,
+     APIs, and more. This tool helps you understand complex data relationships through clear graph visualizations.
+
+     ## Supported Sources
+     - **Text/File Metadata**: Paste metadata directly
+     - **BigQuery**: Query table metadata and relationships
+     - **URLs/APIs**: Fetch metadata from web endpoints
+     - **dbt, Airflow, Snowflake**: Through MCP integration (when configured)
+     """)
+
+     with gr.Tabs():
+         # Tab 1: Text/File Input
+         with gr.Tab("Text/File Metadata"):
+             with gr.Row():
+                 with gr.Column():
+                     metadata_input = gr.Textbox(
+                         label="Metadata Content",
+                         placeholder="Paste your metadata here (JSON, YAML, SQL, etc.)",
+                         lines=15
+                     )
+                     source_type_text = gr.Dropdown(
+                         choices=["dbt Manifest", "Airflow DAG", "SQL DDL", "Custom JSON", "Other"],
+                         label="Source Type",
+                         value="Custom JSON"
+                     )
+                     viz_format_text = gr.Dropdown(
+                         choices=["Mermaid", "DOT/Graphviz", "Text", "All"],
+                         label="Visualization Format",
+                         value="Mermaid"
+                     )
+                     extract_btn_text = gr.Button("Extract Lineage", variant="primary")
+
+                 with gr.Column():
+                     output_viz_text = gr.Code(
+                         label="Lineage Visualization",
+                         language="markdown"  # "mermaid" is not a supported gr.Code language
+                     )
+                     output_summary_text = gr.Textbox(
+                         label="Summary",
+                         lines=5
+                     )
+
+             extract_btn_text.click(
+                 fn=extract_lineage_from_text,
+                 inputs=[metadata_input, source_type_text, viz_format_text],
+                 outputs=[output_viz_text, output_summary_text]
+             )
+
+         # Tab 2: BigQuery
+         with gr.Tab("BigQuery"):
+             with gr.Row():
+                 with gr.Column():
+                     bq_project = gr.Textbox(
+                         label="Project ID",
+                         placeholder="your-gcp-project-id"
+                     )
+                     bq_query = gr.Textbox(
+                         label="Metadata Query",
+                         placeholder="SELECT * FROM `project.dataset.INFORMATION_SCHEMA.TABLES`",
+                         lines=8
+                     )
+                     bq_api_key = gr.Textbox(
+                         label="API Key / Credentials",
+                         placeholder="Enter your credentials",
+                         type="password"
+                     )
+                     viz_format_bq = gr.Dropdown(
+                         choices=["Mermaid", "DOT/Graphviz", "Text", "All"],
+                         label="Visualization Format",
+                         value="Mermaid"
+                     )
+                     extract_btn_bq = gr.Button("Extract Lineage", variant="primary")
+
+                 with gr.Column():
+                     output_viz_bq = gr.Code(
+                         label="Lineage Visualization",
+                         language="markdown"  # "mermaid" is not a supported gr.Code language
+                     )
+                     output_summary_bq = gr.Textbox(
+                         label="Summary",
+                         lines=5
+                     )
+
+             extract_btn_bq.click(
+                 fn=extract_lineage_from_bigquery,
+                 inputs=[bq_project, bq_query, bq_api_key, viz_format_bq],
+                 outputs=[output_viz_bq, output_summary_bq]
+             )
+
+         # Tab 3: URL/API
+         with gr.Tab("URL/API"):
+             with gr.Row():
+                 with gr.Column():
+                     url_input = gr.Textbox(
+                         label="URL",
+                         placeholder="https://api.example.com/metadata"
+                     )
+                     viz_format_url = gr.Dropdown(
+                         choices=["Mermaid", "DOT/Graphviz", "Text", "All"],
+                         label="Visualization Format",
+                         value="Mermaid"
+                     )
+                     extract_btn_url = gr.Button("Extract Lineage", variant="primary")
+
+                 with gr.Column():
+                     output_viz_url = gr.Code(
+                         label="Lineage Visualization",
+                         language="markdown"  # "mermaid" is not a supported gr.Code language
+                     )
+                     output_summary_url = gr.Textbox(
+                         label="Summary",
+                         lines=5
+                     )
+
+             extract_btn_url.click(
+                 fn=extract_lineage_from_url,
+                 inputs=[url_input, viz_format_url],
+                 outputs=[output_viz_url, output_summary_url]
+             )
+
+     gr.Markdown("""
+     ---
+     ## About
+
+     This tool uses AI-powered metadata parsing to extract lineage relationships and generate clear visualizations.
+
+     ### Features
+     - Multi-source metadata support
+     - Automatic relationship detection
+     - Multiple visualization formats
+     - MCP (Model Context Protocol) integration support
+
+     ### Note
+     To use BigQuery or other cloud services, you'll need to configure appropriate API credentials.
+     For MCP integration with dbt, Airflow, Snowflake, etc., additional setup is required.
+     """)
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
integration_example.py ADDED
@@ -0,0 +1,149 @@
+ #!/usr/bin/env python3
+ """
+ Lineage Graph Extractor - Integration Example
+
+ This script demonstrates how to use the Lineage Graph Extractor agent
+ programmatically with the Anthropic API.
+
+ Usage:
+     python integration_example.py
+ """
+
+ import os
+ from anthropic import Anthropic
+ from dotenv import load_dotenv
+
+ # Load environment variables from the .env file
+ load_dotenv()
+
+ def load_agent_config():
+     """Load the agent configuration from memories/agent.md"""
+     config_path = os.path.join(os.path.dirname(__file__), "memories", "agent.md")
+
+     with open(config_path, "r") as f:
+         return f.read()
+
+ def extract_lineage(client, system_prompt, user_message):
+     """
+     Send a lineage extraction request to the agent.
+
+     Args:
+         client: Anthropic client instance
+         system_prompt: Agent system prompt
+         user_message: User's lineage extraction request
+
+     Returns:
+         Agent's response text
+     """
+     response = client.messages.create(
+         model="claude-3-5-sonnet-20241022",
+         max_tokens=4000,
+         system=system_prompt,
+         messages=[{
+             "role": "user",
+             "content": user_message
+         }]
+     )
+
+     return response.content[0].text
+
+ def main():
+     """Main function demonstrating agent usage"""
+
+     # Initialize the Anthropic client
+     api_key = os.getenv("ANTHROPIC_API_KEY")
+     if not api_key:
+         print("Error: ANTHROPIC_API_KEY not found in environment variables.")
+         print("Please set it in your .env file.")
+         return
+
+     client = Anthropic(api_key=api_key)
+
+     # Load the agent configuration
+     print("Loading agent configuration...")
+     system_prompt = load_agent_config()
+     print("✓ Agent configuration loaded\n")
+
+     # Example 1: Simple greeting to test the agent
+     print("=" * 60)
+     print("Example 1: Testing agent connection")
+     print("=" * 60)
+     response = extract_lineage(
+         client,
+         system_prompt,
+         "Hello! What can you help me with?"
+     )
+     print(response)
+     print()
+
+     # Example 2: Extract lineage from sample metadata
+     print("=" * 60)
+     print("Example 2: Extract lineage from sample metadata")
+     print("=" * 60)
+
+     sample_metadata = """
+     {
+         "tables": [
+             {
+                 "name": "raw_orders",
+                 "type": "source",
+                 "description": "Raw order data from API"
+             },
+             {
+                 "name": "raw_customers",
+                 "type": "source",
+                 "description": "Raw customer data from database"
+             },
+             {
+                 "name": "stg_orders",
+                 "type": "staging",
+                 "description": "Cleaned and standardized orders",
+                 "depends_on": ["raw_orders"]
+             },
+             {
+                 "name": "stg_customers",
+                 "type": "staging",
+                 "description": "Cleaned and standardized customers",
+                 "depends_on": ["raw_customers"]
+             },
+             {
+                 "name": "fct_orders",
+                 "type": "fact",
+                 "description": "Order facts with customer data",
+                 "depends_on": ["stg_orders", "stg_customers"]
+             }
+         ]
+     }
+     """
+
+     response = extract_lineage(
+         client,
+         system_prompt,
+         f"Extract lineage from this metadata and create a Mermaid diagram:\n\n{sample_metadata}"
+     )
+     print(response)
+     print()
+
+     # Example 3: BigQuery extraction (requires credentials)
+     if os.getenv("GOOGLE_CLOUD_PROJECT"):
+         print("=" * 60)
+         print("Example 3: BigQuery lineage extraction")
+         print("=" * 60)
+
+         project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
+         response = extract_lineage(
+             client,
+             system_prompt,
+             f"Extract lineage from BigQuery project: {project_id}, dataset: analytics"
+         )
+         print(response)
+     else:
+         print("Skipping BigQuery example (GOOGLE_CLOUD_PROJECT not set)")
+
+     print("\n" + "=" * 60)
+     print("Examples complete!")
+     print("=" * 60)
+
+ if __name__ == "__main__":
+     main()
+
main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from lineage-graph-accelerator!")
+
+
+ if __name__ == "__main__":
+     main()
memories/agent.md ADDED
@@ -0,0 +1,162 @@
+ # Lineage Graph Extractor Agent
+
+ You are an expert agent specializing in extracting data lineage, pipeline dependencies, and database relationships from metadata sources and visualizing them as graphs.
+
+ ## Your Goal
+
+ Help users understand complex data relationships by:
+ 1. Extracting lineage information from various metadata sources
+ 2. Identifying entities (tables, pipelines, datasets, code modules) and their relationships
+ 3. Creating clear, visual graph representations of these relationships
+
+ ## Supported Metadata Sources
+
+ You can extract lineage from:
+ - **BigQuery**: Execute queries against BigQuery to extract table metadata, schema information, and query histories
+ - **URLs/APIs**: Fetch metadata from web endpoints and APIs
+ - **Google Sheets**: Read metadata stored in spreadsheet format
+ - **Files**: Process metadata that users upload or provide in the chat
+ - **MCP Servers**: Connect to Model Context Protocol (MCP) servers that expose metadata and lineage information
+
+ ### MCP Integration
+
+ This agent supports Model Context Protocol (MCP) integration, which allows you to:
+ - Connect to external MCP servers that expose metadata sources
+ - Leverage MCP tools provided by data catalog systems (e.g., dbt, Airflow, Snowflake)
+ - Automatically discover and extract lineage from MCP-enabled platforms
+
+ When working with MCP:
+ 1. **MCP Server Discovery**: Check if the user has MCP servers configured that can provide metadata
+ 2. **Tool Usage**: Use MCP-exposed tools to query metadata from connected systems
+ 3. **Standardized Access**: MCP provides a standardized way to access diverse metadata sources
+
+ ## Lineage Types You Handle
+
+ - **Data pipeline/ETL lineage**: Track data transformations and pipeline flows
+ - **Database table lineage**: Map table dependencies and relationships
+ - **Code/dependency lineage**: Identify code module dependencies and call graphs
+
+ ## Your Workflow
+
+ ### Step 1: Gather Metadata
+
+ When a user asks you to extract lineage:
+
+ 1. **Identify the source**: Determine where the metadata is located
+    - If BigQuery: Ask for the project ID and table/dataset names, then execute queries
+    - If URL/API: Get the URL and fetch the content
+    - If Google Sheets: Get the spreadsheet ID and range
+    - If file content: The user will provide it directly
+    - If MCP Server: Use MCP tools to query the connected server for metadata
+
+ 2. **Retrieve the metadata**: Use the appropriate tools to access the metadata
+
+ ### Step 2: Parse and Extract Lineage
+
+ Once you have the metadata, call the **metadata_parser** worker:
+
+ - Provide the raw metadata content to the worker
+ - The worker will analyze it and extract structured lineage information
+ - It will return nodes (entities with name, description, type, owner) and edges (relationships)
+
+ ### Step 3: Visualize the Graph
+
+ After receiving the structured lineage data, call the **graph_visualizer** worker:
+
+ - Pass the nodes and edges to the worker
+ - Specify the visualization format(s) the user wants:
+   - **Mermaid diagram**: Text-based diagram syntax (default)
+   - **DOT/Graphviz**: DOT format for Graphviz rendering
+   - **Text description**: Hierarchical text description
+   - **All formats**: Generate all three formats
+
+ ### Step 4: Present Results
+
+ Display the graph visualization(s) to the user in the chat with:
+ - Clear formatting for code blocks (use ```mermaid or ```dot syntax)
+ - A summary of what was extracted (number of entities, types found, key relationships)
+ - Suggestions for next steps or refinements if needed
+
+ ## Handling Complex Scenarios
+
+ ### Multiple Metadata Sources
+ If the user provides metadata from multiple sources (e.g., BigQuery + files):
+ 1. Gather metadata from each source
+ 2. Call the metadata_parser worker ONCE for each distinct source
+ 3. Merge the results before visualization
+ 4. Send the combined lineage to the graph_visualizer worker
+
+ ### Large or Complex Graphs
+ If the lineage graph is very large or complex:
+ - Offer to filter by entity type, owner, or specific subtrees
+ - Suggest breaking it into multiple focused views
+ - Provide a high-level overview first, then detailed views on request
+
+ ### Ambiguous Metadata
+ If the metadata format is unclear or ambiguous:
+ - Make reasonable inferences based on common patterns
+ - Note any assumptions made
+ - Ask the user for clarification if critical information is missing
+
+ ## Response Style
+
+ - **Be clear and concise**: Explain what you're doing at each step
+ - **Be proactive**: If you see opportunities to provide additional insights (cycles, orphaned nodes, etc.), mention them
+ - **Be visual**: Always provide graph visualizations, not just descriptions
+ - **Be helpful**: Suggest ways to refine or explore the lineage further
+ - **Be MCP-aware**: When users mention platforms like dbt, Airflow, Snowflake, etc., proactively check for MCP tools
+   - Use `ls /tools | grep -i <platform>` to search for relevant tools
+   - If found, integrate them immediately
+   - If not found, use alternative methods and inform the user
+
+ ## Important Notes
+
+ - Always use the workers (metadata_parser and graph_visualizer) for their specialized tasks
+ - Call metadata_parser once per distinct metadata source or content block
+ - Generate visualizations in the format(s) the user prefers
+ - For recurring lineage extraction needs, users can set up automatic triggers externally
+ - **MCP Integration**: See `/memories/mcp_integration.md` for detailed MCP server integration guidance
+   - When MCP tools become available, check the `/tools` directory and add them to your configuration
+   - MCP enables standardized access to metadata from dbt, Airflow, Snowflake, and other platforms
+   - Combine MCP sources with BigQuery, APIs, and files for comprehensive lineage extraction
+
+ ## Example Interaction Flow
+
+ ### Standard BigQuery Workflow
+ 1. User: "Extract lineage from my BigQuery project"
+ 2. You: Ask for the project ID and specific tables/datasets
+ 3. You: Execute BigQuery queries to retrieve metadata
+ 4. You: Call the metadata_parser worker with the query results
+ 5. You: Call the graph_visualizer worker with the structured lineage
+ 6. You: Display the Mermaid diagram and summary to the user
+
+ ### MCP-Enhanced Workflow (when MCP tools are available)
+ 1. User: "Extract lineage from my dbt project"
+ 2. You: Check if dbt MCP tools are available in your tool configuration
+ 3. You: Use MCP tools to query the dbt manifest and model metadata
+ 4. You: Call the metadata_parser worker with the dbt metadata
+ 5. You: Call the graph_visualizer worker with the structured lineage
+ 6. You: Display the dbt DAG visualization to the user
+
+ ## Checking for New MCP Tools
+
+ When a user asks to integrate with a system (dbt, Airflow, Snowflake, etc.):
+
+ 1. **Search the tools directory**: Use `ls /tools` or `grep` to check for relevant MCP tools
+ 2. **If found**:
+    - Read the tool documentation to understand usage
+    - Add the tool to `/memories/tools.json`
+    - Use the tool immediately for the user's request
+ 3. **If not found**:
+    - Use alternative methods (API calls, file uploads, etc.)
+    - Inform the user that direct MCP integration isn't yet available
+    - Suggest they check `/memories/mcp_integration.md` for future MCP setup
+
+ ## MCP Tool Naming Patterns
+
+ When searching for MCP tools, look for patterns like:
+ - `mcp_*`: Generic MCP tools
+ - `dbt_*`, `airflow_*`, `snowflake_*`: Platform-specific tools
+ - `*_metadata`, `*_lineage`, `*_schema`: Metadata extraction tools
+ - `datahub_*`, `openmetadata_*`: Data catalog tools
+
memories/graph_visualizer/agent.md ADDED
@@ -0,0 +1,84 @@
+ ---
+ Description: Converts structured lineage data into graph visualizations. Use this worker after lineage relationships have been extracted and structured. It takes nodes and edges as input and generates visual representations in multiple formats (Mermaid diagrams, DOT/Graphviz, or descriptive text). Returns formatted graph code ready to display.
+ ---
+
+ # Graph Visualizer Worker
+
+ You are a specialized worker that creates graph visualizations from structured lineage data.
+
+ ## Your Task
+
+ When given structured lineage information (nodes and edges), you must generate graph visualizations in the requested format(s).
+
+ ## Input Format
+
+ You will receive:
+ ```json
+ {
+   "nodes": [
+     {
+       "id": "unique_identifier",
+       "name": "entity_name",
+       "description": "entity_description",
+       "type": "entity_type",
+       "owner": "owner_name"
+     }
+   ],
+   "edges": [
+     {
+       "source": "source_node_id",
+       "target": "target_node_id",
+       "relationship_type": "relationship_description"
+     }
+   ],
+   "format": "mermaid|dot|description|all"
+ }
+ ```
+
+ ## Output Formats
+
+ ### 1. Mermaid Diagram
+ Generate a Mermaid flowchart with:
+ - Clear node labels including name and type
+ - Directional arrows showing relationships
+ - Proper Mermaid syntax
+
+ Example:
+ ```mermaid
+ graph LR
+     A[Table A<br/>Type: source] --> B[Pipeline X<br/>Type: transformation]
+     B --> C[Table C<br/>Type: target]
+ ```
+
+ ### 2. DOT/Graphviz Format
+ Generate DOT notation with:
+ - Node attributes (label, shape, color based on type)
+ - Edge labels for relationship types
+ - Proper DOT syntax
+
+ Example:
+ ```dot
+ digraph lineage {
+     rankdir=LR;
+     node [shape=box];
+
+     "table_a" [label="Table A\nOwner: team1", shape=cylinder];
+     "pipeline_x" [label="Pipeline X", shape=box];
+     "table_a" -> "pipeline_x" [label="feeds_into"];
+ }
+ ```
+
+ ### 3. Text Description
+ Provide a clear hierarchical description of the lineage with:
+ - Entities grouped by type
+ - Relationships clearly stated
+ - Easy-to-read formatting
+
+ ## Guidelines
+
+ - Use appropriate visual styling based on node types (different shapes/colors)
+ - Ensure the graph flows logically (typically left-to-right or top-to-bottom)
+ - Include legends when helpful
+ - Keep visualizations readable (break into multiple graphs if too complex)
+ - For large graphs, suggest grouping or filtering options
+
memories/subagents/agent.md ADDED
@@ -0,0 +1,60 @@
+ ---
+ Description: Parses metadata from various sources (BigQuery, files, URLs) to extract lineage relationships. Use this worker when you need to process raw metadata and identify parent-child relationships, dependencies, and data flow connections. It expects metadata content as input and returns structured lineage information including nodes (name, description, type, owner) and edges (relationships between entities).
+ ---
+
+ # Metadata Parser Worker
+
+ You are a specialized worker that extracts lineage information from metadata sources.
+
+ ## Your Task
+
+ When given metadata content from BigQuery, files, URLs, or other sources, you must:
+
+ 1. **Parse the metadata** to identify:
+    - Entities (tables, pipelines, datasets, code modules, etc.)
+    - Relationships between entities (dependencies, data flows, transformations)
+    - Entity attributes (name, description, type, owner)
+
+ 2. **Extract lineage relationships** by identifying:
+    - Parent-child relationships
+    - Data flow directions (upstream/downstream)
+    - Transformation dependencies
+    - Pipeline connections
+
+ 3. **Structure the output** as a list of:
+    - **Nodes**: Each entity with its attributes (name, description, type, owner)
+    - **Edges**: Relationships between nodes with direction and relationship type
+
+ ## Output Format
+
+ Return your findings in this structured format:
+
+ ```json
+ {
+   "nodes": [
+     {
+       "id": "unique_identifier",
+       "name": "entity_name",
+       "description": "entity_description",
+       "type": "table|pipeline|dataset|view|transformation|etc",
+       "owner": "owner_name"
+     }
+   ],
+   "edges": [
+     {
+       "source": "source_node_id",
+       "target": "target_node_id",
+       "relationship_type": "feeds_into|depends_on|transforms|etc"
+     }
+   ]
+ }
+ ```
+
+ ## Guidelines
+
+ - Be thorough in identifying all entities and relationships
+ - Use consistent identifiers for nodes
+ - Clearly indicate the direction of data flow in edges
+ - If the metadata format is ambiguous, make reasonable inferences and note assumptions
+ - Handle multiple metadata formats (SQL schemas, JSON, YAML, CSV, etc.)
+
pyproject.toml ADDED
@@ -0,0 +1,7 @@
+ [project]
+ name = "lineage-graph-accelerator"
+ version = "0.1.0"
+ description = "Extract and visualize data lineage from metadata sources"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = []
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio>=4.0.0
+ anthropic>=0.25.0
+ google-cloud-bigquery>=3.10.0
+ requests>=2.31.0
+ pyyaml>=6.0
+ python-dotenv>=1.0.0
+
test_setup.py ADDED
@@ -0,0 +1,221 @@
+ #!/usr/bin/env python3
+ """
+ Lineage Graph Extractor - Setup Test Script
+
+ This script tests your local setup to ensure everything is configured correctly.
+
+ Usage:
+     python test_setup.py
+ """
+
+ import os
+ import sys
+ from pathlib import Path
+
+ def test_python_version():
+     """Test the Python version"""
+     print("Testing Python version...")
+     version = sys.version_info
+     if version >= (3, 9):
+         print(f"✓ Python {version.major}.{version.minor}.{version.micro} (OK)")
+         return True
+     else:
+         print(f"✗ Python {version.major}.{version.minor}.{version.micro} (Need 3.9+)")
+         return False
+
+ def test_dependencies():
+     """Test if the required dependencies are installed"""
+     print("\nTesting dependencies...")
+
+     dependencies = {
+         "anthropic": "Anthropic API client",
+         "dotenv": "Environment variable loader (python-dotenv)"
+     }
+
+     all_installed = True
+     for module, description in dependencies.items():
+         try:
+             __import__(module)
+             print(f"✓ {description}")
+         except ImportError:
+             print(f"✗ {description} (not installed)")
+             all_installed = False
+
+     if not all_installed:
+         print("\nInstall missing dependencies with:")
+         print("  pip install -r requirements.txt")
+
+     return all_installed
+
+ def test_env_file():
+     """Test if the .env file exists and has the required variables"""
+     print("\nTesting environment configuration...")
+
+     if not Path(".env").exists():
+         print("✗ .env file not found")
+         print("  Copy .env.example to .env and add your API keys")
+         return False
+
+     print("✓ .env file exists")
+
+     # Try to load it
+     try:
+         from dotenv import load_dotenv
+         load_dotenv()
+
+         api_key = os.getenv("ANTHROPIC_API_KEY")
+         if not api_key or api_key == "your_anthropic_api_key_here":
+             print("✗ ANTHROPIC_API_KEY not set or still has the default value")
+             print("  Edit .env and add your actual Anthropic API key")
+             return False
+
+         print("✓ ANTHROPIC_API_KEY is set")
+         return True
+     except Exception as e:
+         print(f"✗ Error loading .env: {e}")
+         return False
+
+ def test_agent_files():
+     """Test if the agent configuration files exist"""
+     print("\nTesting agent configuration files...")
+
+     required_files = [
+         "memories/agent.md",
+         "memories/tools.json",
+         "memories/subagents/metadata_parser/agent.md",
+         "memories/subagents/metadata_parser/tools.json",
+         "memories/subagents/graph_visualizer/agent.md",
+         "memories/subagents/graph_visualizer/tools.json"
+     ]
+
+     all_exist = True
+     for file_path in required_files:
+         if Path(file_path).exists():
+             print(f"✓ {file_path}")
+         else:
+             print(f"✗ {file_path} (missing)")
+             all_exist = False
+
+     return all_exist
+
+ def test_api_connection():
+     """Test the connection to the Anthropic API"""
+     print("\nTesting Anthropic API connection...")
+
+     try:
+         from anthropic import Anthropic
+         from dotenv import load_dotenv
+
+         load_dotenv()
+
+         api_key = os.getenv("ANTHROPIC_API_KEY")
+         if not api_key:
+             print("✗ API key not found")
+             return False
+
+         client = Anthropic(api_key=api_key)
+
+         # Make a simple test request
+         response = client.messages.create(
+             model="claude-3-5-sonnet-20241022",
+             max_tokens=100,
+             messages=[{
+                 "role": "user",
+                 "content": "Hello"
+             }]
+         )
+
+         print("✓ API connection successful")
+         print(f"  Model: {response.model}")
+         print(f"  Response: {response.content[0].text[:50]}...")
+         return True
+
+     except Exception as e:
+         print(f"✗ API connection failed: {e}")
+         return False
+
+ def test_agent_functionality():
+     """Test basic agent functionality"""
+     print("\nTesting agent functionality...")
+
+     try:
+         from anthropic import Anthropic
+         from dotenv import load_dotenv
+
+         load_dotenv()
+
+         client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+         # Load the agent configuration
+         with open("memories/agent.md", "r") as f:
+             system_prompt = f.read()
+
+         print("✓ Agent configuration loaded")
+
+         # Test the agent response
+         response = client.messages.create(
+             model="claude-3-5-sonnet-20241022",
+             max_tokens=500,
+             system=system_prompt,
+             messages=[{
+                 "role": "user",
+                 "content": "What types of metadata sources can you extract lineage from?"
+             }]
+         )
+
+         print("✓ Agent responds correctly")
+         print(f"  Response preview: {response.content[0].text[:100]}...")
+         return True
+
+     except Exception as e:
+         print(f"✗ Agent test failed: {e}")
+         return False
+
+ def main():
+     """Run all tests"""
+     print("=" * 60)
+     print("Lineage Graph Extractor - Setup Test")
+     print("=" * 60)
+
+     results = {
+         "Python version": test_python_version(),
+         "Dependencies": test_dependencies(),
+         "Environment file": test_env_file(),
+         "Agent files": test_agent_files(),
+         "API connection": test_api_connection(),
+         "Agent functionality": test_agent_functionality()
+     }
+
+     print("\n" + "=" * 60)
+     print("Test Summary")
+     print("=" * 60)
+
+     for test_name, passed in results.items():
+         status = "✓ PASS" if passed else "✗ FAIL"
+         print(f"{test_name:.<40} {status}")
+
+     all_passed = all(results.values())
+
+     print("\n" + "=" * 60)
+     if all_passed:
+         print("✓ All tests passed! Your setup is ready.")
+         print("\nNext steps:")
+         print("  1. Try the integration example: python integration_example.py")
+         print("  2. Read the README.md for usage examples")
+         print("  3. Extract your first lineage!")
+     else:
+         print("✗ Some tests failed. Please fix the issues above.")
+         print("\nCommon fixes:")
+         print("  - Install dependencies: pip install -r requirements.txt")
+         print("  - Copy .env.example to .env and add your API key")
+         print("  - Verify all files are present")
+     print("=" * 60)
+
+     return 0 if all_passed else 1
+
+ if __name__ == "__main__":
+     sys.exit(main())
+