Commit
·
ffe0724
1
Parent(s):
b304992
Add Google Gemini AI Assistant chatbot
Browse files

Features:
- AI Assistant tab powered by Google Gemini (gemini-1.5-flash)
- Natural language lineage generation from pipeline descriptions
- Auto-extract JSON from AI responses
- One-click transfer of generated JSON to lineage tool
- Updated hero section and footer with AI Assistant info
Sponsor integration: Google Gemini API
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <[email protected]>
- README.md +1 -1
- app.py +209 -0
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -290,7 +290,7 @@ python test_setup.py
|
|
| 290 |
## 🔜 Roadmap
|
| 291 |
|
| 292 |
- [x] Gradio 6 upgrade for enhanced UI components
|
| 293 |
-
- [ ] Agentic chatbot for natural language queries
|
| 294 |
- [x] Apache Atlas export support
|
| 295 |
- [ ] File upload functionality
|
| 296 |
- [x] Graph export as PNG/SVG
|
|
|
|
| 290 |
## 🔜 Roadmap
|
| 291 |
|
| 292 |
- [x] Gradio 6 upgrade for enhanced UI components
|
| 293 |
+
- [x] Agentic chatbot for natural language queries (Google Gemini)
|
| 294 |
- [x] Apache Atlas export support
|
| 295 |
- [ ] File upload functionality
|
| 296 |
- [x] Graph export as PNG/SVG
|
app.py
CHANGED
|
@@ -22,6 +22,13 @@ try:
|
|
| 22 |
except ImportError:
|
| 23 |
EXPORTERS_AVAILABLE = False
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# ============================================================================
|
| 26 |
# Constants and Configuration
|
| 27 |
# ============================================================================
|
|
@@ -773,6 +780,127 @@ def extract_lineage_from_url(
|
|
| 773 |
return render_mermaid(viz), f"Lineage from URL: {url or 'not specified'}"
|
| 774 |
|
| 775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
# ============================================================================
|
| 777 |
# Gradio UI
|
| 778 |
# ============================================================================
|
|
@@ -797,6 +925,7 @@ with gr.Blocks(
|
|
| 797 |
| **Visualize** | Generate interactive Mermaid diagrams with color-coded nodes and relationship labels |
|
| 798 |
| **Export** | Export to enterprise data catalogs: OpenLineage, Collibra, Purview, Alation, Atlas |
|
| 799 |
| **MCP Integration** | Connect to MCP servers for AI-powered metadata extraction |
|
|
|
|
| 800 |
|
| 801 |
### Quick Start
|
| 802 |
|
|
@@ -1046,6 +1175,85 @@ with gr.Blocks(
|
|
| 1046 |
outputs=[demo_viz, demo_summary]
|
| 1047 |
)
|
| 1048 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
# Footer
|
| 1050 |
gr.Markdown("""
|
| 1051 |
---
|
|
@@ -1058,6 +1266,7 @@ with gr.Blocks(
|
|
| 1058 |
| **Collibra** | Collibra Data Intelligence | Enterprise data governance |
|
| 1059 |
| **Purview** | Microsoft Purview | Azure ecosystem |
|
| 1060 |
| **Alation** | Alation Data Catalog | Self-service analytics |
|
|
|
|
| 1061 |
|
| 1062 |
---
|
| 1063 |
|
|
|
|
| 22 |
except ImportError:
|
| 23 |
EXPORTERS_AVAILABLE = False
|
| 24 |
|
| 25 |
+
# Import Google Gemini for agentic chatbot
|
| 26 |
+
try:
|
| 27 |
+
import google.generativeai as genai
|
| 28 |
+
GEMINI_AVAILABLE = True
|
| 29 |
+
except ImportError:
|
| 30 |
+
GEMINI_AVAILABLE = False
|
| 31 |
+
|
| 32 |
# ============================================================================
|
| 33 |
# Constants and Configuration
|
| 34 |
# ============================================================================
|
|
|
|
| 780 |
return render_mermaid(viz), f"Lineage from URL: {url or 'not specified'}"
|
| 781 |
|
| 782 |
|
| 783 |
+
# ============================================================================
|
| 784 |
+
# Gemini Agentic Chatbot
|
| 785 |
+
# ============================================================================
|
| 786 |
+
|
| 787 |
+
LINEAGE_AGENT_PROMPT = """You are a Data Lineage Assistant powered by the Lineage Graph Accelerator tool.
|
| 788 |
+
You help users understand, extract, and visualize data lineage from various sources.
|
| 789 |
+
|
| 790 |
+
Your capabilities:
|
| 791 |
+
1. **Extract Lineage**: Parse metadata from dbt manifests, Airflow DAGs, SQL DDL, and custom JSON
|
| 792 |
+
2. **Explain Lineage**: Help users understand data flow and dependencies
|
| 793 |
+
3. **Generate Metadata**: Create lineage JSON from natural language descriptions
|
| 794 |
+
4. **Export Guidance**: Advise on exporting to data catalogs (OpenLineage, Collibra, Purview, Alation, Atlas)
|
| 795 |
+
|
| 796 |
+
When users describe their data pipeline, generate valid JSON lineage in this format:
|
| 797 |
+
```json
|
| 798 |
+
{
|
| 799 |
+
"nodes": [
|
| 800 |
+
{"id": "unique_id", "type": "source|table|model|view|report", "name": "Display Name"}
|
| 801 |
+
],
|
| 802 |
+
"edges": [
|
| 803 |
+
{"from": "source_id", "to": "target_id"}
|
| 804 |
+
]
|
| 805 |
+
}
|
| 806 |
+
```
|
| 807 |
+
|
| 808 |
+
Node types: source, table, model, view, report, dimension, fact, destination, task
|
| 809 |
+
|
| 810 |
+
Be helpful, concise, and always offer to generate lineage JSON when users describe data flows.
|
| 811 |
+
If the user provides metadata or describes a pipeline, generate the JSON they can paste into the tool."""
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
def init_gemini(api_key: str) -> bool:
|
| 815 |
+
"""Initialize Gemini with the provided API key."""
|
| 816 |
+
if not GEMINI_AVAILABLE:
|
| 817 |
+
return False
|
| 818 |
+
if not api_key:
|
| 819 |
+
return False
|
| 820 |
+
try:
|
| 821 |
+
genai.configure(api_key=api_key)
|
| 822 |
+
return True
|
| 823 |
+
except Exception:
|
| 824 |
+
return False
|
| 825 |
+
|
| 826 |
+
|
| 827 |
+
def chat_with_gemini(
|
| 828 |
+
message: str,
|
| 829 |
+
history: List[Dict[str, str]],
|
| 830 |
+
api_key: str
|
| 831 |
+
) -> Tuple[List[Dict[str, str]], str]:
|
| 832 |
+
"""Chat with Gemini about data lineage."""
|
| 833 |
+
if not GEMINI_AVAILABLE:
|
| 834 |
+
return history + [
|
| 835 |
+
{"role": "user", "content": message},
|
| 836 |
+
{"role": "assistant", "content": "Google Gemini is not available. Please install google-generativeai package."}
|
| 837 |
+
], ""
|
| 838 |
+
|
| 839 |
+
if not api_key:
|
| 840 |
+
return history + [
|
| 841 |
+
{"role": "user", "content": message},
|
| 842 |
+
{"role": "assistant", "content": "Please enter your Google Gemini API key to use the chatbot. You can get one at https://makersuite.google.com/app/apikey"}
|
| 843 |
+
], ""
|
| 844 |
+
|
| 845 |
+
try:
|
| 846 |
+
genai.configure(api_key=api_key)
|
| 847 |
+
model = genai.GenerativeModel('gemini-1.5-flash')
|
| 848 |
+
|
| 849 |
+
# Build conversation history for context
|
| 850 |
+
chat_history = []
|
| 851 |
+
for msg in history:
|
| 852 |
+
role = "user" if msg.get("role") == "user" else "model"
|
| 853 |
+
chat_history.append({"role": role, "parts": [msg.get("content", "")]})
|
| 854 |
+
|
| 855 |
+
# Start chat with history
|
| 856 |
+
chat = model.start_chat(history=chat_history)
|
| 857 |
+
|
| 858 |
+
# Send message with system prompt context
|
| 859 |
+
full_prompt = f"{LINEAGE_AGENT_PROMPT}\n\nUser query: {message}"
|
| 860 |
+
response = chat.send_message(full_prompt)
|
| 861 |
+
|
| 862 |
+
assistant_message = response.text
|
| 863 |
+
|
| 864 |
+
# Extract any JSON from the response for the metadata field
|
| 865 |
+
extracted_json = ""
|
| 866 |
+
if "```json" in assistant_message:
|
| 867 |
+
try:
|
| 868 |
+
json_start = assistant_message.find("```json") + 7
|
| 869 |
+
json_end = assistant_message.find("```", json_start)
|
| 870 |
+
if json_end > json_start:
|
| 871 |
+
extracted_json = assistant_message[json_start:json_end].strip()
|
| 872 |
+
except Exception:
|
| 873 |
+
pass
|
| 874 |
+
|
| 875 |
+
new_history = history + [
|
| 876 |
+
{"role": "user", "content": message},
|
| 877 |
+
{"role": "assistant", "content": assistant_message}
|
| 878 |
+
]
|
| 879 |
+
|
| 880 |
+
return new_history, extracted_json
|
| 881 |
+
|
| 882 |
+
except Exception as e:
|
| 883 |
+
error_msg = f"Error communicating with Gemini: {str(e)}"
|
| 884 |
+
return history + [
|
| 885 |
+
{"role": "user", "content": message},
|
| 886 |
+
{"role": "assistant", "content": error_msg}
|
| 887 |
+
], ""
|
| 888 |
+
|
| 889 |
+
|
| 890 |
+
def use_generated_json(json_text: str) -> Tuple[str, str, str]:
|
| 891 |
+
"""Use the generated JSON in the lineage extractor."""
|
| 892 |
+
if not json_text.strip():
|
| 893 |
+
return "", "", "No JSON to use. Ask the chatbot to generate lineage JSON first."
|
| 894 |
+
|
| 895 |
+
try:
|
| 896 |
+
# Validate JSON
|
| 897 |
+
json.loads(json_text)
|
| 898 |
+
# Return the JSON to be used in the main tab
|
| 899 |
+
return json_text, "Custom JSON", "JSON copied to metadata input. Switch to 'Text/File Metadata' tab and click 'Extract Lineage'."
|
| 900 |
+
except json.JSONDecodeError as e:
|
| 901 |
+
return "", "", f"Invalid JSON: {str(e)}"
|
| 902 |
+
|
| 903 |
+
|
| 904 |
# ============================================================================
|
| 905 |
# Gradio UI
|
| 906 |
# ============================================================================
|
|
|
|
| 925 |
| **Visualize** | Generate interactive Mermaid diagrams with color-coded nodes and relationship labels |
|
| 926 |
| **Export** | Export to enterprise data catalogs: OpenLineage, Collibra, Purview, Alation, Atlas |
|
| 927 |
| **MCP Integration** | Connect to MCP servers for AI-powered metadata extraction |
|
| 928 |
+
| **AI Assistant** | Chat with Gemini to generate lineage from natural language descriptions |
|
| 929 |
|
| 930 |
### Quick Start
|
| 931 |
|
|
|
|
| 1175 |
outputs=[demo_viz, demo_summary]
|
| 1176 |
)
|
| 1177 |
|
| 1178 |
+
# Tab 5: AI Chatbot (Gemini)
|
| 1179 |
+
with gr.Tab("AI Assistant", id="chatbot"):
|
| 1180 |
+
gr.Markdown("""
|
| 1181 |
+
## Lineage AI Assistant (Powered by Google Gemini)
|
| 1182 |
+
|
| 1183 |
+
Ask questions about data lineage, describe your data pipeline in natural language,
|
| 1184 |
+
and get JSON metadata you can use to visualize lineage.
|
| 1185 |
+
|
| 1186 |
+
**Examples:**
|
| 1187 |
+
- "I have a PostgreSQL database that feeds into a Spark ETL job, which outputs to a Snowflake warehouse"
|
| 1188 |
+
- "Generate lineage for a dbt project with staging, intermediate, and mart layers"
|
| 1189 |
+
- "What's the best way to document column-level lineage?"
|
| 1190 |
+
""")
|
| 1191 |
+
|
| 1192 |
+
with gr.Row():
|
| 1193 |
+
with gr.Column(scale=2):
|
| 1194 |
+
gemini_api_key = gr.Textbox(
|
| 1195 |
+
label="Google Gemini API Key",
|
| 1196 |
+
placeholder="Enter your Gemini API key (get one at makersuite.google.com)",
|
| 1197 |
+
type="password",
|
| 1198 |
+
info="Your API key is not stored and only used for this session"
|
| 1199 |
+
)
|
| 1200 |
+
|
| 1201 |
+
chatbot_display = gr.Chatbot(
|
| 1202 |
+
label="Chat with Lineage AI",
|
| 1203 |
+
height=400
|
| 1204 |
+
)
|
| 1205 |
+
|
| 1206 |
+
with gr.Row():
|
| 1207 |
+
chat_input = gr.Textbox(
|
| 1208 |
+
label="Your message",
|
| 1209 |
+
placeholder="Describe your data pipeline or ask about lineage...",
|
| 1210 |
+
lines=2,
|
| 1211 |
+
scale=4
|
| 1212 |
+
)
|
| 1213 |
+
send_btn = gr.Button("Send", variant="primary", scale=1)
|
| 1214 |
+
|
| 1215 |
+
with gr.Accordion("Generated JSON (if any)", open=False):
|
| 1216 |
+
generated_json = gr.Code(
|
| 1217 |
+
label="Extracted JSON",
|
| 1218 |
+
language="json",
|
| 1219 |
+
lines=10
|
| 1220 |
+
)
|
| 1221 |
+
use_json_btn = gr.Button("Use This JSON in Lineage Tool", size="sm")
|
| 1222 |
+
json_status = gr.Textbox(label="Status", interactive=False)
|
| 1223 |
+
|
| 1224 |
+
# Chat handlers
|
| 1225 |
+
chat_state = gr.State([])
|
| 1226 |
+
|
| 1227 |
+
def handle_chat(message, history, api_key):
|
| 1228 |
+
if not message.strip():
|
| 1229 |
+
return history, "", history
|
| 1230 |
+
new_history, extracted = chat_with_gemini(message, history, api_key)
|
| 1231 |
+
return new_history, extracted, new_history
|
| 1232 |
+
|
| 1233 |
+
send_btn.click(
|
| 1234 |
+
fn=handle_chat,
|
| 1235 |
+
inputs=[chat_input, chat_state, gemini_api_key],
|
| 1236 |
+
outputs=[chatbot_display, generated_json, chat_state]
|
| 1237 |
+
).then(
|
| 1238 |
+
fn=lambda: "",
|
| 1239 |
+
outputs=[chat_input]
|
| 1240 |
+
)
|
| 1241 |
+
|
| 1242 |
+
chat_input.submit(
|
| 1243 |
+
fn=handle_chat,
|
| 1244 |
+
inputs=[chat_input, chat_state, gemini_api_key],
|
| 1245 |
+
outputs=[chatbot_display, generated_json, chat_state]
|
| 1246 |
+
).then(
|
| 1247 |
+
fn=lambda: "",
|
| 1248 |
+
outputs=[chat_input]
|
| 1249 |
+
)
|
| 1250 |
+
|
| 1251 |
+
use_json_btn.click(
|
| 1252 |
+
fn=use_generated_json,
|
| 1253 |
+
inputs=[generated_json],
|
| 1254 |
+
outputs=[metadata_input, source_type, json_status]
|
| 1255 |
+
)
|
| 1256 |
+
|
| 1257 |
# Footer
|
| 1258 |
gr.Markdown("""
|
| 1259 |
---
|
|
|
|
| 1266 |
| **Collibra** | Collibra Data Intelligence | Enterprise data governance |
|
| 1267 |
| **Purview** | Microsoft Purview | Azure ecosystem |
|
| 1268 |
| **Alation** | Alation Data Catalog | Self-service analytics |
|
| 1269 |
+
| **Atlas** | Apache Atlas | Open-source governance |
|
| 1270 |
|
| 1271 |
---
|
| 1272 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
anthropic>=0.25.0
|
| 3 |
google-cloud-bigquery>=3.10.0
|
|
|
|
| 4 |
requests>=2.31.0
|
| 5 |
pyyaml>=6.0
|
|
|
|
| 1 |
gradio>=6.0.0
|
| 2 |
anthropic>=0.25.0
|
| 3 |
google-cloud-bigquery>=3.10.0
|
| 4 |
+
google-generativeai>=0.8.0
|
| 5 |
requests>=2.31.0
|
| 6 |
pyyaml>=6.0
|