diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8c7c690 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +sitemap.db +sitemap.db-journal +sitemap-api +go.sum \ No newline at end of file diff --git a/PROJECT_OVERVIEW.md b/Documentation/PROJECT_OVERVIEW.md similarity index 100% rename from PROJECT_OVERVIEW.md rename to Documentation/PROJECT_OVERVIEW.md diff --git a/QUICKSTART.md b/Documentation/QUICKSTART.md similarity index 100% rename from QUICKSTART.md rename to Documentation/QUICKSTART.md diff --git a/README.md b/Documentation/README.md similarity index 100% rename from README.md rename to Documentation/README.md diff --git a/Documentation/STRUCTURE.md b/Documentation/STRUCTURE.md new file mode 100644 index 0000000..258284f --- /dev/null +++ b/Documentation/STRUCTURE.md @@ -0,0 +1,280 @@ + +# 📁 SITEMAP-API PROJECT STRUCTURE + +## ROOT FILES +``` +main.go ⚙️ HTTP server, routes, middleware +go.mod 📦 Dependencies (chi, cors, uuid, sqlite3, x/net) +run.sh 🚀 Quick start script +Makefile 🔧 Build commands (run, build, clean, test) +Dockerfile 🐳 Container configuration +.gitignore 🚫 Git exclusions +.env.example ⚙️ Environment template +``` + +## DOCUMENTATION +``` +README.md 📖 Full API documentation +QUICKSTART.md ⏱️ 3-step quick start guide +PROJECT_OVERVIEW.md 📊 Complete implementation details +``` + +## CODE STRUCTURE + +### handlers/ +``` +└── handler.go 🎯 HTTP REQUEST HANDLERS + - GenerateSitemapXML() POST /generate-sitemap-xml + - StreamSSE() GET /stream/{uuid} + - DownloadSitemap() GET /download/{uuid} + - GetSites() GET /sites + - GetSite() GET /sites/{id} + - DeleteSite() DELETE /sites/{id} + - Health() GET /health + + 🔄 STREAM MANAGER + - NewStreamManager() Concurrent SSE handling + - CreateStream() Per-UUID channels + - GetStream() Retrieve channel + - CloseStream() Cleanup + + 🔍 METADATA EXTRACTORS + - getClientIP() IP address + - parseBrowser() Browser detection + - parseOS() OS detection + - parseDeviceType() Device detection + - 
extractCookies() Cookie parsing + - getOrCreateSession() Session management +``` + +### crawler/ +``` +└── crawler.go 🕷️ WEB CRAWLER ENGINE + - NewCrawler() Initialize crawler + - Crawl() Main crawl orchestrator + - crawlURL() Recursive URL processing + - extractLinks() HTML link extraction + - resolveURL() Relative → absolute + - normalizeURL() URL canonicalization + - isSameDomain() Domain validation + - calculatePriority() Sitemap priority (0.0-1.0) + - sendEvent() SSE event emission +``` + +### database/ +``` +└── db.go 💾 SQLITE DATABASE LAYER + - NewDB() Initialize DB + - createTables() Schema setup + - CreateSite() Insert site + - GetSiteByUUID() Fetch by UUID + - GetSiteByID() Fetch by ID + - GetAllSites() List all + - UpdateSiteStatus() Mark complete/failed + - DeleteSite() Remove site + - AddPage() Insert page + - GetPagesBySiteID() Fetch pages +``` + +### models/ +``` +└── site.go 📋 DATA STRUCTURES + - Site Main site record + - Page Discovered page + - Event SSE event + - ProgressData Progress payload + - CompleteData Completion payload + - ErrorData Error payload +``` + +### static/ +``` +└── index.html 🎨 FRONTEND APPLICATION + HTML: + - Form section URL input, depth selector + - Progress section Live stats, progress bar + - Log section Activity console + - Results section Site list, download buttons + + JavaScript: + - SitemapGenerator class Main controller + - generateSitemap() POST to API + - connectToStream() SSE connection + - updateProgress() Live UI updates + - downloadSitemap() File download + - loadExistingSites() Fetch site list + - displaySites() Render results +``` + +## RUNTIME GENERATED +``` +sitemap.db 💾 SQLite database (auto-created on first run) +sitemap.db-journal 📝 SQLite rollback journal (temporary) +sitemap-api ⚙️ Compiled binary (from: go build) +go.sum 🔒 Dependency checksums (from: go mod download) +``` + +## FILE COUNTS +``` +Go source files: 5 files +HTML files: 1 file +Documentation: 3 files +Config files: 6 files +───────────────────────── +Total: 
15 files +``` + +## LINES OF CODE +``` +handlers/handler.go ~600 lines (HTTP handlers, SSE, metadata) +crawler/crawler.go ~250 lines (Concurrent crawler) +database/db.go ~250 lines (SQLite operations) +models/site.go ~50 lines (Data structures) +main.go ~70 lines (Server setup) +static/index.html ~850 lines (Full UI with CSS & JS) +───────────────────────────────────── +Total: ~2,070 lines +``` + +## KEY DEPENDENCIES (go.mod) +``` +github.com/go-chi/chi/v5 Router & middleware +github.com/go-chi/cors CORS support +github.com/google/uuid UUID generation +github.com/mattn/go-sqlite3 SQLite driver +golang.org/x/net HTML parsing +``` + +## VISUAL TREE +``` +sitemap-api/ +│ +├── 📄 main.go # Entry point & server +├── 📦 go.mod # Dependencies +├── 🚀 run.sh # Quick start +├── 🔧 Makefile # Build commands +├── 🐳 Dockerfile # Containerization +├── ⚙️ .env.example # Config template +├── 🚫 .gitignore # Git exclusions +│ +├── 📚 Documentation/ +│ ├── README.md # Full docs +│ ├── QUICKSTART.md # Quick start +│ └── PROJECT_OVERVIEW.md # Implementation details +│ +├── 🎯 handlers/ +│ └── handler.go # All HTTP endpoints + SSE +│ +├── 🕷️ crawler/ +│ └── crawler.go # Concurrent web crawler +│ +├── 💾 database/ +│ └── db.go # SQLite operations +│ +├── 📋 models/ +│ └── site.go # Data structures +│ +└── 🎨 static/ + └── index.html # Frontend UI +``` + +## DATA FLOW +``` +User Browser + │ + ├─► POST /generate-sitemap-xml + │ └─► handlers.GenerateSitemapXML() + │ ├─► Generate UUID + │ ├─► Extract metadata (IP, browser, etc) + │ ├─► database.CreateSite() + │ ├─► streamManager.CreateStream(uuid) + │ ├─► go crawler.Crawl() [goroutine] + │ └─► Return {uuid, site_id, stream_url} + │ + ├─► GET /stream/{uuid} + │ └─► handlers.StreamSSE() + │ └─► streamManager.GetStream(uuid) + │ └─► Forward events to browser + │ + └─► GET /download/{uuid} + └─► handlers.DownloadSitemap() + ├─► database.GetSiteByUUID() + ├─► database.GetPagesBySiteID() + └─► Generate XML sitemap + +Crawler (goroutine) + │ + ├─► Fetch 
URL + ├─► Parse HTML links + ├─► database.AddPage() + ├─► Send SSE progress event + └─► Recursively crawl children (with goroutines) +``` + +## DATABASE SCHEMA +``` +┌─────────────────┐ +│ sites │ +├─────────────────┤ +│ id │ PK +│ uuid │ UNIQUE (server-generated) +│ domain │ +│ url │ +│ max_depth │ +│ page_count │ +│ status │ (processing/completed/failed) +│ ip_address │ (client metadata) +│ user_agent │ +│ browser │ +│ browser_version │ +│ os │ +│ device_type │ +│ session_id │ +│ cookies │ (JSON) +│ referrer │ +│ created_at │ +│ completed_at │ +│ last_crawled │ +└─────────────────┘ + │ + │ 1:N + ↓ +┌─────────────────┐ +│ pages │ +├─────────────────┤ +│ id │ PK +│ site_id │ FK → sites.id +│ url │ UNIQUE +│ depth │ +│ last_modified │ +│ priority │ (0.0 - 1.0) +│ change_freq │ (monthly/weekly/etc) +└─────────────────┘ + +┌─────────────────┐ +│ sessions │ +├─────────────────┤ +│ id │ PK +│ session_id │ UNIQUE +│ uuid │ FK → sites.uuid +│ ip_address │ +│ created_at │ +│ last_activity │ +└─────────────────┘ +``` + +## CONCURRENCY MODEL +``` +StreamManager + ├─► map[uuid]chan Event (thread-safe with mutex) + │ + └─► Per-UUID Channel + └─► Event stream to browser + +Crawler + ├─► Main goroutine (Crawl) + │ └─► Spawns goroutines for each URL + │ + └─► Semaphore (5 concurrent max) + └─► Controls parallel requests +``` diff --git a/crawler.go b/crawler/crawler.go similarity index 100% rename from crawler.go rename to crawler/crawler.go diff --git a/db.go b/database/db.go similarity index 100% rename from db.go rename to database/db.go diff --git a/handler.go b/handlers/handler.go similarity index 100% rename from handler.go rename to handlers/handler.go diff --git a/site.go b/models/site.go similarity index 100% rename from site.go rename to models/site.go diff --git a/index.html b/static/index.html similarity index 100% rename from index.html rename to static/index.html