beforeLocal

2026-05-05 00:47:39 +00:00
parent fdf2cf3659
commit 70ee32efdd
31 changed files with 1169 additions and 0 deletions
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -0,0 +1,18 @@
+# Base image: slim variant of the official Python 3.11 image.
+FROM python:3.11-slim
+
+# Relative paths used below (COPY targets, run.sh) resolve under /app.
+WORKDIR /app
+
+# Install system dependencies for building psycopg2 and others.
+# The apt package lists are deleted in the same layer so they are not
+# kept in the image.
+RUN apt-get update && apt-get install -y \
+    gcc \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy and install the Python requirements before the rest of the source,
+# so this layer can be reused when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the remaining build context into /app.
+COPY . .
+
+# run.sh is the container entry point; make sure it is executable.
+RUN chmod +x run.sh
+
+ENTRYPOINT ["./run.sh"]
--- a/api/database.py
+++ b/api/database.py
@@ -0,0 +1,44 @@
+import os
+from urllib.parse import quote
+
+from pgvector.sqlalchemy import Vector
+from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, text
+from sqlalchemy.orm import declarative_base, sessionmaker
+
+POSTGRES_USER = os.environ.get("POSTGRES_USER", "allmail")
+POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "postgres")
+POSTGRES_DB = os.environ.get("POSTGRES_DB", "emails_db")
+DB_HOST = os.environ.get("DB_HOST", "localhost")
+
+DATABASE_URL = f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{DB_HOST}/{POSTGRES_DB}"
+
+engine = create_engine(DATABASE_URL)
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+Base = declarative_base()
+
+# Gemini gemini-embedding-001 with output_dimensionality=768
+EMBEDDING_DIMENSIONS = 768
+
+class Email(Base):
+    """ORM model for one indexed email, stored in the ``emails`` table."""
+
+    __tablename__ = "emails"
+
+    # Surrogate integer primary key.
+    id = Column(Integer, primary_key=True, index=True)
+    # Unique per row; the indexer relies on this constraint to detect
+    # messages that were already stored.
+    message_id = Column(String, unique=True, index=True)
+    subject = Column(Text)
+    sender = Column(String)
+    date = Column(DateTime)
+    # Text extracted from the message body by the indexer.
+    content = Column(Text)
+    # pgvector column sized to EMBEDDING_DIMENSIONS.
+    embedding = Column(Vector(EMBEDDING_DIMENSIONS))
+
+def init_db():
+    # Install pgvector extension if not exists
+    with engine.connect() as conn:
+        conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
+        conn.commit()
+    Base.metadata.create_all(bind=engine)
+
+def get_db():
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
--- a/api/indexer.py
+++ b/api/indexer.py
@@ -0,0 +1,149 @@
+import os
+import time
+import email
+from email.policy import default
+from bs4 import BeautifulSoup
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler
+from database import SessionLocal, Email, EMBEDDING_DIMENSIONS
+from google import genai
+from google.genai import types
+from datetime import datetime
+from sqlalchemy.exc import IntegrityError
+
+MAILDIR_PATH = os.environ.get("MAILDIR_PATH", "/Maildir")
+
+# Initialize Gemini client
+gemini_client = None
+api_key = os.environ.get("GEMINI_API_KEY")
+if api_key:
+    gemini_client = genai.Client(api_key=api_key)
+    print("Gemini client initialized for indexer.")
+else:
+    print("WARNING: GEMINI_API_KEY not set. Indexer will skip embedding generation.")
+
+def extract_text_from_email(msg):
+    text_content = ""
+    if msg.is_multipart():
+        for part in msg.walk():
+            content_type = part.get_content_type()
+            if content_type == "text/plain":
+                payload = part.get_payload(decode=True)
+                if payload:
+                    text_content += payload.decode('utf-8', errors='ignore') + "\n"
+            elif content_type == "text/html":
+                payload = part.get_payload(decode=True)
+                if payload:
+                    html_content = payload.decode('utf-8', errors='ignore')
+                    soup = BeautifulSoup(html_content, 'html.parser')
+                    text_content += soup.get_text(separator=' ') + "\n"
+    else:
+        content_type = msg.get_content_type()
+        payload = msg.get_payload(decode=True)
+        if payload:
+            if content_type == "text/html":
+                html_content = payload.decode('utf-8', errors='ignore')
+                soup = BeautifulSoup(html_content, 'html.parser')
+                text_content = soup.get_text(separator=' ')
+            else:
+                text_content = payload.decode('utf-8', errors='ignore')
+    return text_content.strip()
+
+def process_email_file(filepath):
+    print(f"Processing new email file: {filepath}")
+    if not gemini_client:
+        print("Skipping embedding generation: Gemini API key is missing.")
+        return
+
+    try:
+        with open(filepath, 'rb') as f:
+            msg = email.message_from_binary_file(f, policy=default)
+
+        message_id = msg.get('Message-ID', filepath)
+        subject = msg.get('Subject', '')
+        sender = msg.get('From', '')
+        date_str = msg.get('Date')
+        
+        try:
+            email_date = email.utils.parsedate_to_datetime(date_str) if date_str else datetime.utcnow()
+        except:
+            email_date = datetime.utcnow()
+
+        content = extract_text_from_email(msg)
+
+        if not content:
+            print(f"No text content found in {filepath}. Skipping.")
+            return
+
+        # Combine subject and content for better embedding
+        text_to_embed = f"Subject: {subject}\nSender: {sender}\n\n{content}"
+        
+        # Limit text to avoid token limits (very rough truncation)
+        text_to_embed = text_to_embed[:8000]
+
+        # Get embedding via Gemini — RETRIEVAL_DOCUMENT is the correct task type
+        # for content being stored and later retrieved by a query
+        response = gemini_client.models.embed_content(
+            model="gemini-embedding-001",
+            contents=text_to_embed,
+            config=types.EmbedContentConfig(
+                task_type="RETRIEVAL_DOCUMENT",
+                output_dimensionality=EMBEDDING_DIMENSIONS,
+            ),
+        )
+        embedding = response.embeddings[0].values
+
+        # Save to DB
+        db = SessionLocal()
+        try:
+            new_email = Email(
+                message_id=message_id,
+                subject=subject,
+                sender=sender,
+                date=email_date,
+                content=content,
+                embedding=embedding
+            )
+            db.add(new_email)
+            db.commit()
+            print(f"Successfully indexed email: {subject}")
+        except IntegrityError:
+            db.rollback()
+            print(f"Email {message_id} already exists in database.")
+        except Exception as e:
+            db.rollback()
+            print(f"Database error saving email: {e}")
+        finally:
+            db.close()
+
+    except Exception as e:
+        print(f"Error processing email {filepath}: {e}")
+
+class NewEmailHandler(FileSystemEventHandler):
+    def on_created(self, event):
+        if not event.is_directory:
+            # Simple check if it's likely an email file (mbsync creates files in cur/ or new/)
+            if 'new/' in event.src_path or 'cur/' in event.src_path:
+                process_email_file(event.src_path)
+
+def start_watching():
+    print(f"Starting to watch {MAILDIR_PATH} for new emails...")
+    
+    # Optional: Do a full initial sync of existing files here.
+    # We will skip that for brevity and just watch for new ones.
+    
+    event_handler = NewEmailHandler()
+    observer = Observer()
+    observer.schedule(event_handler, MAILDIR_PATH, recursive=True)
+    observer.start()
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        observer.stop()
+    observer.join()
+
+if __name__ == "__main__":
+    # Wait for DB to be initialized by FastAPI.
+    # NOTE(review): this is a fixed delay, not a readiness check — confirm
+    # that the tables exist by the time the first email is processed.
+    time.sleep(5)
+    start_watching()
--- a/api/main.py
+++ b/api/main.py
@@ -0,0 +1,95 @@
+from fastapi import FastAPI, Depends, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+from database import get_db, init_db, Email, EMBEDDING_DIMENSIONS
+from pydantic import BaseModel
+from typing import List, Optional
+from google import genai
+from google.genai import types
+import os
+import time
+
+app = FastAPI(title="Unified Email Semantic Search API")
+
+# Setup CORS for the SPA
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Initialize Gemini client — reads GEMINI_API_KEY from environment
+gemini_client = None
+
+# Request body accepted by POST /search.
+class SearchQuery(BaseModel):
+    # Free-text query; it is embedded and compared with the stored email
+    # embeddings.
+    query: str
+    # Maximum number of results to return.
+    limit: int = 10
+
+# One entry of the POST /search response list.
+class SearchResult(BaseModel):
+    message_id: str
+    subject: str
+    sender: str
+    # ISO 8601 string produced with isoformat(), or "" when no date is stored.
+    date: str
+    # Leading part of the email content (first 200 characters plus "..."
+    # when the content is longer).
+    snippet: str
+    # Cosine distance between the query embedding and the email embedding.
+    distance: float
+
+@app.on_event("startup")
+def on_startup():
+    global gemini_client
+    print("Initializing Database...")
+    time.sleep(2) # Give postgres a moment to be fully ready
+    try:
+        init_db()
+    except Exception as e:
+        print(f"Error initializing DB: {e}")
+
+    api_key = os.environ.get("GEMINI_API_KEY")
+    if api_key:
+        gemini_client = genai.Client(api_key=api_key)
+        print("Gemini client initialized.")
+    else:
+        print("WARNING: GEMINI_API_KEY not set. Embedding features disabled.")
+
+@app.post("/search", response_model=List[SearchResult])
+def search_emails(request: SearchQuery, db: Session = Depends(get_db)):
+    if not gemini_client:
+        raise HTTPException(status_code=500, detail="Gemini API Key is not configured.")
+
+    try:
+        response = gemini_client.models.embed_content(
+            model="gemini-embedding-001",
+            contents=request.query,
+            config=types.EmbedContentConfig(
+                task_type="RETRIEVAL_QUERY",
+                output_dimensionality=EMBEDDING_DIMENSIONS,
+            ),
+        )
+        query_embedding = response.embeddings[0].values
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Embedding API error: {e}")
+
+    # Use pgvector's cosine distance operator via SQLAlchemy ORM
+    results = db.query(
+        Email, 
+        Email.embedding.cosine_distance(query_embedding).label('distance')
+    ).order_by(
+        Email.embedding.cosine_distance(query_embedding)
+    ).limit(request.limit).all()
+
+    response_data = []
+    for email, distance in results:
+        # Create a snippet from the content
+        snippet = email.content[:200] + "..." if email.content and len(email.content) > 200 else (email.content or "")
+        response_data.append(SearchResult(
+            message_id=email.message_id or "",
+            subject=email.subject or "",
+            sender=email.sender or "",
+            date=email.date.isoformat() if email.date else "",
+            snippet=snippet,
+            distance=distance
+        ))
+
+    return response_data
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -0,0 +1,9 @@
+# Python dependencies installed by the Dockerfile (pip install -r requirements.txt).
+# Every entry is pinned to an exact version except google-genai, which
+# only sets a minimum version.
+fastapi==0.111.0
+uvicorn==0.30.1
+sqlalchemy==2.0.30
+psycopg2-binary==2.9.9
+pgvector==0.2.5
+watchdog==4.0.1
+google-genai>=1.0.0
+beautifulsoup4==4.12.3
+pydantic==2.7.2
--- a/api/run.sh
+++ b/api/run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo "Starting Uvicorn..."
+uvicorn main:app --host 0.0.0.0 --port 8000 &
+
+echo "Starting Indexer daemon..."
+python indexer.py &
+
+# Wait for any process to exit
+wait -n
+
+# Exit with status of process that exited first
+exit $?