beforeLocal
Some checks failed
Deploy to Server / deploy (push) Failing after 5s

This commit is contained in:
2026-05-05 00:47:39 +00:00
parent fdf2cf3659
commit 70ee32efdd
31 changed files with 1169 additions and 0 deletions

18
api/Dockerfile Normal file
View File

@@ -0,0 +1,18 @@
# Image for the API container: installs the Python dependencies, copies the
# application source and starts everything through run.sh.
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies for building psycopg2 and others
# NOTE(review): requirements.txt pins psycopg2-binary (prebuilt wheels);
# confirm whether the compiler toolchain below is still required.
RUN apt-get update && apt-get install -y \
gcc \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
# Copy and install the requirements before the rest of the source so the
# dependency layer can be reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# run.sh is the entrypoint, so it has to be executable inside the image.
RUN chmod +x run.sh
ENTRYPOINT ["./run.sh"]

44
api/database.py Normal file
View File

@@ -0,0 +1,44 @@
import os
from urllib.parse import quote

from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, text
from sqlalchemy.orm import declarative_base, sessionmaker
# Connection settings come from the environment; the defaults are only
# suitable for local development.
POSTGRES_USER = os.environ.get("POSTGRES_USER", "allmail")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "postgres")
POSTGRES_DB = os.environ.get("POSTGRES_DB", "emails_db")
DB_HOST = os.environ.get("DB_HOST", "localhost")

# Percent-encode the credentials before embedding them in the URL. Without
# this, a password containing "@", ":", "/" or "%" is parsed as part of the
# URL structure and the connection targets the wrong host or fails outright.
_url_user = quote(POSTGRES_USER, safe="")
_url_password = quote(POSTGRES_PASSWORD, safe="")
DATABASE_URL = f"postgresql://{_url_user}:{_url_password}@{DB_HOST}/{POSTGRES_DB}"

engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

# Gemini gemini-embedding-001 with output_dimensionality=768
EMBEDDING_DIMENSIONS = 768
class Email(Base):
    """ORM model for one indexed email, stored in the ``emails`` table."""

    __tablename__ = "emails"
    # Surrogate integer primary key.
    id = Column(Integer, primary_key=True, index=True)
    # Declared unique, so inserting the same message twice violates the
    # constraint instead of creating a duplicate row.
    message_id = Column(String, unique=True, index=True)
    subject = Column(Text)
    sender = Column(String)
    date = Column(DateTime)
    # Extracted body text of the message.
    content = Column(Text)
    # pgvector column sized to EMBEDDING_DIMENSIONS.
    embedding = Column(Vector(EMBEDDING_DIMENSIONS))
def init_db():
    """Enable the pgvector extension, then create every mapped table.

    The extension is created first because the ``Vector`` column type used
    by the models only exists once the extension is installed.
    """
    enable_pgvector = text("CREATE EXTENSION IF NOT EXISTS vector")
    with engine.connect() as connection:
        connection.execute(enable_pgvector)
        connection.commit()
    Base.metadata.create_all(bind=engine)
def get_db():
    """Yield a fresh database session and close it when the consumer is done.

    Written as a generator so it can be used as a dependency: the code after
    ``yield`` runs once the consumer has finished with the session.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs whether the consumer completed normally or raised.
        session.close()

149
api/indexer.py Normal file
View File

@@ -0,0 +1,149 @@
import os
import time
import email
from email.policy import default
from bs4 import BeautifulSoup
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from database import SessionLocal, Email, EMBEDDING_DIMENSIONS
from google import genai
from google.genai import types
from datetime import datetime
from sqlalchemy.exc import IntegrityError
# Root of the Maildir tree to watch; overridable through the environment.
MAILDIR_PATH = os.environ.get("MAILDIR_PATH", "/Maildir")

# Create the Gemini client only when an API key is available. Without one the
# client stays None and process_email_file() skips every message.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    gemini_client = None
    print("WARNING: GEMINI_API_KEY not set. Indexer will skip embedding generation.")
else:
    gemini_client = genai.Client(api_key=api_key)
    print("Gemini client initialized for indexer.")
def _decode_part_text(part):
    """Return the decoded text of one non-multipart MIME part, or None if empty.

    The content-transfer-encoding is undone by ``get_payload(decode=True)``.
    The resulting bytes are decoded with the charset declared on the part,
    falling back to UTF-8 when no charset is declared or Python does not know
    the declared one. Undecodable bytes are dropped.
    """
    payload = part.get_payload(decode=True)
    if not payload:
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # Unknown or misspelled charset label in the message headers.
        return payload.decode("utf-8", errors="ignore")


def _html_to_text(html_content):
    """Strip markup from an HTML string and return its visible text."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text(separator=' ')


def extract_text_from_email(msg):
    """Extract the readable body text from a parsed email message.

    For multipart messages every ``text/plain`` and ``text/html`` part is
    collected in document order (HTML is reduced to its text), except parts
    marked ``Content-Disposition: attachment``, which are files rather than
    body text. For single-part messages the payload is decoded as text, with
    HTML reduced to its text.

    Args:
        msg: A parsed ``email.message`` object.

    Returns:
        The concatenated body text with surrounding whitespace removed, or an
        empty string when the message has no text payload.
    """
    if not msg.is_multipart():
        body = _decode_part_text(msg)
        if body is None:
            return ""
        if msg.get_content_type() == "text/html":
            body = _html_to_text(body)
        return body.strip()

    chunks = []
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type not in ("text/plain", "text/html"):
            continue
        # Text attachments (logs, CSV exports, ...) are not part of the body.
        if part.get_content_disposition() == "attachment":
            continue
        body = _decode_part_text(part)
        if body is None:
            continue
        if content_type == "text/html":
            body = _html_to_text(body)
        chunks.append(body + "\n")
    return "".join(chunks).strip()
def _parse_email_date(date_str):
    """Parse a ``Date`` header value, falling back to the current UTC time.

    The fallback is used when the header is missing, empty or malformed so
    that a bad date never prevents a message from being indexed.
    """
    if not date_str:
        return datetime.utcnow()
    try:
        return email.utils.parsedate_to_datetime(date_str)
    except (TypeError, ValueError, OverflowError):
        # Only parsing failures are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return datetime.utcnow()


def process_email_file(filepath):
    """Parse one email file, embed its text with Gemini and store it.

    Does nothing when no Gemini client is configured or when the message has
    no extractable text. All errors are reported on stdout and never
    propagate, so a single bad file cannot stop the watcher.

    Args:
        filepath: Path of the email file to index. It is also used as the
            message identifier when the message has no ``Message-ID`` header.
    """
    print(f"Processing new email file: {filepath}")
    if not gemini_client:
        print("Skipping embedding generation: Gemini API key is missing.")
        return
    try:
        with open(filepath, 'rb') as f:
            msg = email.message_from_binary_file(f, policy=default)
        message_id = msg.get('Message-ID', filepath)
        subject = msg.get('Subject', '')
        sender = msg.get('From', '')
        email_date = _parse_email_date(msg.get('Date'))
        content = extract_text_from_email(msg)
        if not content:
            print(f"No text content found in {filepath}. Skipping.")
            return
        # Combine subject and content for better embedding
        text_to_embed = f"Subject: {subject}\nSender: {sender}\n\n{content}"
        # Limit text to avoid token limits (very rough truncation)
        text_to_embed = text_to_embed[:8000]
        # Get embedding via Gemini — RETRIEVAL_DOCUMENT is the correct task type
        # for content being stored and later retrieved by a query
        response = gemini_client.models.embed_content(
            model="gemini-embedding-001",
            contents=text_to_embed,
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_DOCUMENT",
                output_dimensionality=EMBEDDING_DIMENSIONS,
            ),
        )
        embedding = response.embeddings[0].values
        # Save to DB
        db = SessionLocal()
        try:
            new_email = Email(
                message_id=message_id,
                subject=subject,
                sender=sender,
                date=email_date,
                content=content,
                embedding=embedding,
            )
            db.add(new_email)
            db.commit()
            print(f"Successfully indexed email: {subject}")
        except IntegrityError:
            # message_id is unique, so this message was indexed before.
            db.rollback()
            print(f"Email {message_id} already exists in database.")
        except Exception as e:
            db.rollback()
            print(f"Database error saving email: {e}")
        finally:
            db.close()
    except Exception as e:
        # Top-level boundary for one file: report and carry on.
        print(f"Error processing email {filepath}: {e}")
class NewEmailHandler(FileSystemEventHandler):
    """Index email files as they appear in a Maildir ``new/`` or ``cur/`` folder.

    Maildir delivery writes a message into ``tmp/`` and then renames it into
    ``new/`` (or ``cur/``). Inside a recursively watched tree that rename is
    reported as a move rather than a creation, so both event types are
    handled.
    """

    # Maildir sub-folders that hold delivered messages.
    _MAIL_SUBDIRS = ("new", "cur")

    @classmethod
    def _is_mail_path(cls, path):
        """Return True when *path* sits directly inside ``new/`` or ``cur/``."""
        return os.path.basename(os.path.dirname(path)) in cls._MAIL_SUBDIRS

    def on_created(self, event):
        """Index a file created directly inside ``new/`` or ``cur/``."""
        if not event.is_directory and self._is_mail_path(event.src_path):
            process_email_file(event.src_path)

    def on_moved(self, event):
        """Index a file renamed into ``new/`` or ``cur/`` from elsewhere.

        Renames that start inside ``new/`` or ``cur/`` (for example a flag
        change, or a message moving from ``new/`` to ``cur/``) are ignored:
        such a message was already seen when it was delivered, and
        re-processing it would only spend another embedding request.
        """
        if event.is_directory:
            return
        if self._is_mail_path(event.dest_path) and not self._is_mail_path(event.src_path):
            process_email_file(event.dest_path)
def start_watching():
    """Watch MAILDIR_PATH recursively and index new emails until interrupted.

    Blocks the calling thread. On Ctrl-C the observer is stopped and then
    joined before the function returns.
    """
    print(f"Starting to watch {MAILDIR_PATH} for new emails...")
    # Optional: Do a full initial sync of existing files here.
    # We will skip that for brevity and just watch for new ones.
    watcher = Observer()
    watcher.schedule(NewEmailHandler(), MAILDIR_PATH, recursive=True)
    watcher.start()
    try:
        # The observer runs in its own thread; keep this one alive.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
if __name__ == "__main__":
    # Wait for DB to be initialized by FastAPI
    # NOTE(review): this is a fixed delay, not a readiness check; presumably
    # it is long enough for the API process started alongside this one to
    # finish creating the tables. Confirm.
    time.sleep(5)
    start_watching()

95
api/main.py Normal file
View File

@@ -0,0 +1,95 @@
from fastapi import FastAPI, Depends, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from sqlalchemy.orm import Session
from sqlalchemy import text
from database import get_db, init_db, Email, EMBEDDING_DIMENSIONS
from pydantic import BaseModel
from typing import List, Optional
from google import genai
from google.genai import types
import os
import time
app = FastAPI(title="Unified Email Semantic Search API")
# Setup CORS for the SPA
# NOTE(review): every origin, method and header is allowed while credentials
# are enabled. Confirm this is intended outside local development; listing
# the SPA's origin(s) explicitly would be the stricter configuration.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Placeholder for the Gemini client. It is assigned in on_startup() when
# GEMINI_API_KEY is set and stays None otherwise.
gemini_client = None
class SearchQuery(BaseModel):
    """Request body of the ``/search`` endpoint."""

    # Free-text query that is embedded and compared against stored emails.
    query: str
    # Maximum number of results to return.
    limit: int = 10
class SearchResult(BaseModel):
    """One entry in the ``/search`` response."""

    message_id: str
    subject: str
    sender: str
    # ISO-8601 string, or an empty string when the email has no stored date.
    date: str
    # At most the first 200 characters of the content, followed by "..."
    # when the content was longer.
    snippet: str
    # Cosine distance between the query embedding and the email embedding.
    distance: float
@app.on_event("startup")
def on_startup():
    """Prepare the database and the Gemini client when the app starts.

    A database initialisation failure is reported but does not abort
    startup. The module-level ``gemini_client`` is only assigned when
    GEMINI_API_KEY is present in the environment.
    """
    global gemini_client

    print("Initializing Database...")
    time.sleep(2)  # Give postgres a moment to be fully ready
    try:
        init_db()
    except Exception as exc:
        print(f"Error initializing DB: {exc}")

    key = os.environ.get("GEMINI_API_KEY")
    if not key:
        print("WARNING: GEMINI_API_KEY not set. Embedding features disabled.")
        return
    gemini_client = genai.Client(api_key=key)
    print("Gemini client initialized.")
@app.post("/search", response_model=List[SearchResult])
def search_emails(request: SearchQuery, db: Session = Depends(get_db)):
    """Return the stored emails closest in meaning to the query text.

    The query is embedded with Gemini and compared to the stored email
    embeddings by cosine distance, smallest distance first.

    Args:
        request: The query text and the maximum number of results.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A list of ``SearchResult`` items ordered from nearest to farthest.

    Raises:
        HTTPException: 500 when no Gemini client is configured or the
            embedding request fails; 422 when ``limit`` is negative.
    """
    if not gemini_client:
        raise HTTPException(status_code=500, detail="Gemini API Key is not configured.")
    # Reject this before spending an embedding request; the database would
    # otherwise refuse the negative LIMIT and the client would get a bare 500.
    if request.limit < 0:
        raise HTTPException(status_code=422, detail="limit must not be negative.")
    try:
        response = gemini_client.models.embed_content(
            model="gemini-embedding-001",
            contents=request.query,
            config=types.EmbedContentConfig(
                task_type="RETRIEVAL_QUERY",
                output_dimensionality=EMBEDDING_DIMENSIONS,
            ),
        )
        query_embedding = response.embeddings[0].values
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding API error: {e}") from e

    # Use pgvector's cosine distance operator via SQLAlchemy ORM
    distance_expr = Email.embedding.cosine_distance(query_embedding)
    rows = (
        db.query(Email, distance_expr.label('distance'))
        # The embedding column is nullable; a row without one has no
        # distance and would fail validation of the float response field.
        .filter(Email.embedding.is_not(None))
        .order_by(distance_expr)
        .limit(request.limit)
        .all()
    )

    response_data = []
    for email, distance in rows:
        # Create a snippet from the content
        content = email.content or ""
        snippet = content[:200] + "..." if len(content) > 200 else content
        response_data.append(SearchResult(
            message_id=email.message_id or "",
            subject=email.subject or "",
            sender=email.sender or "",
            date=email.date.isoformat() if email.date else "",
            snippet=snippet,
            distance=distance,
        ))
    return response_data

9
api/requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
# Python dependencies installed into the API image.
fastapi==0.111.0
uvicorn==0.30.1
sqlalchemy==2.0.30
psycopg2-binary==2.9.9
pgvector==0.2.5
watchdog==4.0.1
# NOTE(review): the only entry without an exact pin; consider pinning it like
# the others so image builds are reproducible.
google-genai>=1.0.0
beautifulsoup4==4.12.3
pydantic==2.7.2

13
api/run.sh Normal file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Entrypoint: run the API server and the indexer side by side, and finish as
# soon as either of them terminates.

echo "Starting Uvicorn..."
uvicorn main:app --host 0.0.0.0 --port 8000 &

echo "Starting Indexer daemon..."
python indexer.py &

# Block until the first background job finishes and remember its exit code.
wait -n
first_exit=$?

# Report that exit code as the script's own status.
exit "$first_exit"