Add a start button on the dashboard at the end. It should route to another page that accepts either the company's database or a business requirement document. After that it should show the schema, then ask for the user query with three options: normal, Trino, Spark. After the submit button it should show the original query generated, the optimized query generated, and the final result. Lastly it should have a feedback button leading to a feedback page (on that page the original query and optimized query should also be fetched).
I'll add a start button to the dashboard and create a new wizard page for the database query workflow you described. This will include all the functionality you need for uploading a database schema or business requirements, showing the schema, accepting user queries with dialect options, displaying the original and optimized queries with results, and finally providing feedback.
I've created a complete workflow as you requested:
Created a step-by-step query wizard with three main stages:
The wizard shows:
Added a feedback button that takes the user to the feedback page
This implementation provides a guided experience for users to go from database schema to optimized queries with results, and then provide feedback on the generated queries.
add csv files also in database schema and business req document also will be a doc or pdf file and give one more option where user can explain i am giving codes of base agent, query generation agent, schema genration agent, query_optimized and feedback agent (query_optimized and feedback agent is collab file code). integrate all this in my ui # sql_generation_agent.py from .base_agent import BaseSQLAgent from pydantic import BaseModel, ValidationError import json from typing import Optional, Dict, Any class GenerationResult(BaseModel): query: str explanation: str potential_issues: list[str] alternatives: list[str] class SQLGenerationAgent(BaseSQLAgent): SYSTEM_PROMPT = """ You are a SQL expert. Follow these rules: ## **Supported Features ✅** **Use the following features in queries: for SPARK only** - **Date Functions:** `date_trunc`, `date_add`, `current_date`, `unix_timestamp` - **Filtering:** `WHERE`, `BETWEEN` - **Aggregations:** `GROUP BY`, `HAVING` - **Joins:** `INNER, LEFT, RIGHT, FULL OUTER JOIN` - **CTEs (WITH clause):** Use for complex queries - **Window Functions:** `RANK(), ROW_NUMBER(), LEAD(), LAG()` - **JSON Handling:** `get_json_object(), json_tuple()` - **Bucketing & Partitioning:** `PARTITIONED BY`, `CLUSTERED BY` **DO NOT generate queries using the following: for TRINO only** `CREATE INDEX` (Use **partitioning** instead) `CREATE MATERIALIZED VIEW` (Use **regular views** instead) `MERGE INTO` (Use `INSERT INTO ... SELECT` instead) `UPDATE/DELETE` (Limited support; Use **CTAS or INSERT INTO ... SELECT** instead) `AUTO_INCREMENT` (Use `UUID()` or `ROW_NUMBER()`) `BEGIN TRANSACTION` (Use **ETL pipelines** instead) `CREATE PROCEDURE / TRIGGER` (Use **external orchestration** tools like Airflow) 1. Return JSON with EXACTLY these fields: - query: Valid SQL string - explanation: Technical rationale - potential_issues: List of strings - alternatives: List of Top 3 optimized SQL queries using CTEs, indexing, partitioning, etc. in markdown format 2. 
Use this schema: {schema} # ✅ Proper variable 3. Use this SQL-Type (Trino / Spark): {sql_type} # ✅ Proper variable 4. Example response: {{ "query": "SELECT id FROM users", # ✅ No stray formatting "explanation": "Simple ID retrieval", "potential_issues": ["No index on id"], "alternatives": [```SELECT user_id FROM accounts```, ```SELECT id FROM users WHERE created_at > '2023-01-01'```]"] }} """ def __init__(self): super().__init__(self.SYSTEM_PROMPT) def generate_query(self, question: str, schema: str, sql_type: str) -> Dict[str, Any]: result: Optional[dict] = None # Select the correct FAISS index faiss_index = self.select_faiss_index(sql_type) # Retrieve relevant documentation snippets documentation_snippets = self.documentation_search(faiss_index, question) try: result = self.agent_executor.invoke({ "query": question, "schema": schema, "sql_type": sql_type, "docs": documentation_snippets # Pass documentation search results }) if not result: return {"error": "Empty response from SQL agent"} return self._parse_result(result) except Exception as e: return { "error": f"Execution failed: {str(e)}", "raw_response": str(result) if result else "No response generated" } def select_faiss_index(self, sql_type: str): return "trino_faiss_index" if sql_type.lower() == "trino" else "spark_faiss_index" def documentation_search(self, faiss_index: str, query: str): """ Searches documentation for SQL-specific syntax related to the query. Args: faiss_index (str): The FAISS index for the SQL dialect. query (str): The natural language query. Returns: list: Relevant documentation snippets. """ return self.use_tool("DocumentationSearch", {"index": faiss_index, "query": query}) def use_tool(self, tool_name: str, params: dict): """ Uses a tool (e.g., FAISS documentation search) if available. Args: tool_name (str): Name of the tool to use. params (dict): Parameters for the tool. Returns: list: Output from the tool. 
""" if tool_name in self.tools: return self.tools[tool_name].invoke(params) return [] def _parse_result(self, result: dict) -> Dict[str, Any]: if "output" not in result: return {"error": "Missing 'output' in agent response"} raw_output = result["output"] try: parsed = json.loads(raw_output) validated = GenerationResult(**parsed) return validated.dict() except json.JSONDecodeError as e: return { "error": f"Invalid JSON: {str(e)}", "raw_response": raw_output } except ValidationError as e: return { "error": f"Validation errors: {e.errors()}", "raw_response": raw_output },,# sql_generation_agent.py from .base_agent import BaseSQLAgent from pydantic import BaseModel, ValidationError import json from typing import Optional, Dict, Any class GenerationResult(BaseModel): query: str explanation: str potential_issues: list[str] alternatives: list[str] class SQLGenerationAgent(BaseSQLAgent): SYSTEM_PROMPT = """ You are a SQL expert. Follow these rules: ## **Supported Features ✅** **Use the following features in queries: for SPARK only** - **Date Functions:** `date_trunc`, `date_add`, `current_date`, `unix_timestamp` - **Filtering:** `WHERE`, `BETWEEN` - **Aggregations:** `GROUP BY`, `HAVING` - **Joins:** `INNER, LEFT, RIGHT, FULL OUTER JOIN` - **CTEs (WITH clause):** Use for complex queries - **Window Functions:** `RANK(), ROW_NUMBER(), LEAD(), LAG()` - **JSON Handling:** `get_json_object(), json_tuple()` - **Bucketing & Partitioning:** `PARTITIONED BY`, `CLUSTERED BY` **DO NOT generate queries using the following: for TRINO only** `CREATE INDEX` (Use **partitioning** instead) `CREATE MATERIALIZED VIEW` (Use **regular views** instead) `MERGE INTO` (Use `INSERT INTO ... SELECT` instead) `UPDATE/DELETE` (Limited support; Use **CTAS or INSERT INTO ... SELECT** instead) `AUTO_INCREMENT` (Use `UUID()` or `ROW_NUMBER()`) `BEGIN TRANSACTION` (Use **ETL pipelines** instead) `CREATE PROCEDURE / TRIGGER` (Use **external orchestration** tools like Airflow) 1. 
Return JSON with EXACTLY these fields: - query: Valid SQL string - explanation: Technical rationale - potential_issues: List of strings - alternatives: List of Top 3 optimized SQL queries using CTEs, indexing, partitioning, etc. in markdown format 2. Use this schema: {schema} # ✅ Proper variable 3. Use this SQL-Type (Trino / Spark): {sql_type} # ✅ Proper variable 4. Example response: {{ "query": "SELECT id FROM users", # ✅ No stray formatting "explanation": "Simple ID retrieval", "potential_issues": ["No index on id"], "alternatives": [```SELECT user_id FROM accounts```, ```SELECT id FROM users WHERE created_at > '2023-01-01'```]"] }} """ def __init__(self): super().__init__(self.SYSTEM_PROMPT) def generate_query(self, question: str, schema: str, sql_type: str) -> Dict[str, Any]: result: Optional[dict] = None # Select the correct FAISS index faiss_index = self.select_faiss_index(sql_type) # Retrieve relevant documentation snippets documentation_snippets = self.documentation_search(faiss_index, question) try: result = self.agent_executor.invoke({ "query": question, "schema": schema, "sql_type": sql_type, "docs": documentation_snippets # Pass documentation search results }) if not result: return {"error": "Empty response from SQL agent"} return self._parse_result(result) except Exception as e: return { "error": f"Execution failed: {str(e)}", "raw_response": str(result) if result else "No response generated" } def select_faiss_index(self, sql_type: str): return "trino_faiss_index" if sql_type.lower() == "trino" else "spark_faiss_index" def documentation_search(self, faiss_index: str, query: str): """ Searches documentation for SQL-specific syntax related to the query. Args: faiss_index (str): The FAISS index for the SQL dialect. query (str): The natural language query. Returns: list: Relevant documentation snippets. 
""" return self.use_tool("DocumentationSearch", {"index": faiss_index, "query": query}) def use_tool(self, tool_name: str, params: dict): """ Uses a tool (e.g., FAISS documentation search) if available. Args: tool_name (str): Name of the tool to use. params (dict): Parameters for the tool. Returns: list: Output from the tool. """ if tool_name in self.tools: return self.tools[tool_name].invoke(params) return [] def _parse_result(self, result: dict) -> Dict[str, Any]: if "output" not in result: return {"error": "Missing 'output' in agent response"} raw_output = result["output"] try: parsed = json.loads(raw_output) validated = GenerationResult(**parsed) return validated.dict() except json.JSONDecodeError as e: return { "error": f"Invalid JSON: {str(e)}", "raw_response": raw_output } except ValidationError as e: return { "error": f"Validation errors: {e.errors()}", "raw_response": raw_output },# src/agents/schema_designer_agent.py import json import re from typing import Dict, Any from groq import Groq from config.settings import settings class SchemaDesignerAgent: def __init__(self, groq_api_key: str = settings.GROQ_API_KEY): self.client = Groq(api_key=groq_api_key) self.system_prompt = """You are a senior database architect specializing in OLAP systems and dimensional modeling. Your task is to design optimized, normalized schemas based on business requirements. Guidelines: 1. Follow star schema or snowflake schema patterns 2. Identify clear fact and dimension tables 3. Ensure proper primary/foreign key relationships 4. Optimize for analytical queries 5. Include appropriate data types and constraints 6. Consider partitioning strategies for large tables 7. 
Suggest appropriate indexes IMPORTANT: Your response MUST be valid JSON following this exact structure: { "schema_name": "string", "tables": [ { "name": "string", "type": "fact|dimension", "description": "string", "columns": [ { "name": "string", "type": "string", "is_pk": boolean, "is_fk": boolean, "description": "string" } ], "relationships": [ { "from": "string", "to": "string", "type": "string" } ], "partition_key": "string", "indexes": ["string"] } ], "recommendations": ["string"] }""" def _clean_json_response(self, response: str) -> str: """Clean and prepare JSON response for parsing""" # Remove markdown code blocks if present response = re.sub(r'```json\n?', '', response) response = re.sub(r'```\n?', '', response) # Fix common JSON issues response = response.strip() # Ensure the response starts and ends with braces if not response.startswith('{'): response = '{' + response if not response.endswith('}'): response = response + '}' return response def generate_schema(self, business_description: str) -> Dict[str, Any]: """Generate optimized schema from business description""" try: response = self.client.chat.completions.create( model="qwen-2.5-coder-32b", # Using a more reliable model messages=[ { "role": "system", "content": self.system_prompt }, { "role": "user", "content": f"Business requirements:\n{business_description}" } ], temperature=0.3, response_format={"type": "json_object"} ) content = response.choices[0].message.content cleaned_content = self._clean_json_response(content) schema = json.loads(cleaned_content) # Basic validation if not isinstance(schema, dict): raise ValueError("Response is not a JSON object") if "tables" not in schema: raise ValueError("Missing required 'tables' field in response") return schema except json.JSONDecodeError as e: return { "error": "Failed to parse schema response", "details": str(e), "raw_response": content if 'content' in locals() else None } except Exception as e: return { "error": f"Error generating schema: 
{str(e)}", "type": type(e).__name__ },#base_agent.py from langchain.agents import AgentExecutor, Tool from langchain.agents.format_scratchpad.openai_functions import format_to_openai_function_messages from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_core.runnables import RunnablePassthrough from src.core.rag.vector_store import VectorStoreManager from src.core.llm.groq_client import GroqClient from typing import List, Optional, Dict, Any import logging class BaseSQLAgent: def __init__(self, system_prompt: str, verbose: bool = True, max_iterations: int = 5): self.vector_store = VectorStoreManager() self.tools = self._initialize_tools() self.llm = GroqClient().llm.bind_tools(self.tools) self.system_prompt = system_prompt self.verbose = verbose self.max_iterations = max_iterations self.agent_executor = self._create_agent() def _initialize_tools(self) -> List[Tool]: return [ Tool( name="DocumentationSearch", func=self._search_docs, description="Access SQL documentation for syntax verification" ) ] def _search_docs(self, query: str, sql_type: str) -> str: try: # Determine the appropriate FAISS index based on sql_type faiss_index = "trino_faiss_index" if sql_type.lower() == "trino" else "spark_faiss_index" # Load the selected FAISS index db = self.vector_store.load_vector_store(faiss_index) # Perform similarity search (fetch top 3 relevant documents) docs = db.similarity_search(query, k=3) # Return the retrieved documentation content return "\n\n".join([d.page_content for d in docs]) except Exception as e: logging.error(f"Document search failed: {str(e)}") return "Documentation unavailable" def _create_agent(self) -> AgentExecutor: prompt = ChatPromptTemplate.from_messages([ ("system", self.system_prompt), ("user", "Schema: {schema}\nQuery: {query}"), MessagesPlaceholder(variable_name="agent_scratchpad"), ]) agent = ( RunnablePassthrough.assign( 
agent_scratchpad=lambda x: self._format_scratchpad( x.get("intermediate_steps", []) ) ) | prompt | self.llm | OpenAIFunctionsAgentOutputParser() ) return AgentExecutor( agent=agent, tools=self.tools, verbose=self.verbose, max_iterations=self.max_iterations, handle_parsing_errors=self._handle_parsing_error, ) def _format_scratchpad(self, intermediate_steps) -> list: try: return format_to_openai_function_messages(intermediate_steps) except (KeyError, TypeError) as e: logging.warning(f"Scratchpad formatting error: {str(e)}") return [] def _handle_parsing_error(self, error: Exception) -> Dict[str, Any]: logging.error(f"Parsing error: {str(error)}") return { "output": { "error": f"Parsing error: {str(error)}", "raw_response": str(error) } }, query_optimized and feedback agent #extracting schema from csv files code import pandas as pd import sqlite3 import re import unicodedata from transformers import pipeline from pyspark.sql import SparkSession import sqlglot import glob # ======================== # 1. 
DATA LOADING & CLEANING # ======================== def clean_column_names(df): """Convert DataFrame column names to SQL-friendly names.""" df.columns = ( df.columns .str.strip() .str.lower() .str.replace(r"\s+", "_", regex=True) .str.replace(r"[^a-z0-9_]", "", regex=True) .str.replace(r"_+", "_", regex=True) ) df.columns = [f"column_{i}" if col == "" else col for i, col in enumerate(df.columns, 1)] return df # ✅ Load and clean multiple CSV files csv_files = glob.glob("*.csv") # Load all CSV files in the current directory conn = sqlite3.connect(":memory:") # Initialize SQLite in-memory database dataframes = {} # Store DataFrames for Spark execution if not csv_files: print("❌ No CSV files found.") exit() for csv_file in csv_files: try: table_name = re.sub(r"[^a-zA-Z0-9]", "_", csv_file.split(".")[0]) # Clean table name df = pd.read_csv(csv_file) df = clean_column_names(df) df.to_sql(table_name, conn, index=False, if_exists="replace") dataframes[table_name] = df print(f"✅ {csv_file} loaded successfully as table '{table_name}'.") except Exception as e: print(f"❌ Error loading {csv_file}: {e}") exit() # ======================== # 2. DATABASE SCHEMA EXTRACTION # ======================== def get_schema(conn): cursor = conn.cursor() cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") tables = cursor.fetchall() schema_info = "Database Schema:\n" for table in tables: table_name = table[0] cursor.execute(f"PRAGMA table_info({table_name});") columns = cursor.fetchall() schema_info += f"Table '{table_name}' has columns: " schema_info += ", ".join([f"{col[1]} ({col[2]})" for col in columns]) + "\n" return schema_info schema_info = get_schema(conn) print(f"\n✅ Database initialized. Schema:\n{schema_info}")
I'll enhance the query wizard to support CSV files for schema upload, DOC/PDF files for business requirements, and add a natural language explanation option. I'll also integrate the agent code you provided into the UI by simulating their functionality.
I've enhanced the query wizard with all the features you requested:
Added support for multiple file types:
Added a natural language option:
Enhanced the query generation workflow:
Integrated the agent code functionality:
Improved the feedback page:
The UI now provides a complete end-to-end workflow from schema upload/generation to query optimization and feedback, simulating the integration with your Python agent code.