This commit is contained in:
Kar
2026-02-05 19:13:45 +05:30
commit 10b19d4ed6
13 changed files with 2828 additions and 0 deletions

38
Dockerfile Normal file
View File

@@ -0,0 +1,38 @@
# syntax=docker/dockerfile:1

# Build stage: compile the Go binary. CGO must stay enabled because the
# app uses mattn/go-sqlite3, which links against sqlite via cgo.
FROM golang:1.21-alpine AS builder
# Install build dependencies (git for module fetches; gcc/musl-dev/sqlite-dev for CGO)
RUN apk add --no-cache git gcc musl-dev sqlite-dev
WORKDIR /app
# Copy go mod files first so the dependency layer is cached until go.mod/go.sum change
COPY go.mod go.sum ./
RUN go mod download
# Copy source code
COPY . .
# Build the application
RUN CGO_ENABLED=1 GOOS=linux go build -a -installsuffix cgo -o sitemap-api .
# Final stage: minimal runtime image.
# Pinned tag instead of :latest for reproducible builds (hadolint DL3007).
FROM alpine:3.19
# Install runtime dependencies
RUN apk --no-cache add ca-certificates sqlite-libs
# Create a dedicated non-root user instead of running as root in /root/
RUN addgroup -S app && adduser -S -G app app
WORKDIR /app
# Copy binary and static assets from builder, owned by the runtime user
# (the server needs write access to the workdir for sitemap.db)
COPY --from=builder --chown=app:app /app/sitemap-api .
COPY --from=builder --chown=app:app /app/static ./static
USER app
# Expose port (documentation only; publish with `docker run -p`)
EXPOSE 8080
# Set environment
ENV PORT=8080
# Run the application (exec form: binary is PID 1 and receives SIGTERM)
CMD ["./sitemap-api"]

61
Makefile Normal file
View File

@@ -0,0 +1,61 @@
# Build automation for the XML Sitemap Generator API.
# All targets are phony: none of them produce a file named after the target,
# so every target must be listed here or a same-named file would mask it.
.PHONY: help install build run dev clean test format lint docker-build docker-run

help: ## Show this help message
	@echo "XML Sitemap Generator API - Make Commands"
	@echo ""
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'

install: ## Install Go dependencies
	@echo "📦 Installing dependencies..."
	@go mod download
	@echo "✅ Dependencies installed"

build: ## Build the application
	@echo "🔨 Building..."
	@go build -o sitemap-api .
	@echo "✅ Build complete: ./sitemap-api"

run: build ## Build and run the application
	@echo "🚀 Starting server on http://localhost:8080"
	@./sitemap-api

dev: ## Run in development mode (with hot reload if air is installed)
	@if command -v air > /dev/null; then \
		air; \
	else \
		echo "💡 Tip: Install 'air' for hot reload: go install github.com/air-verse/air@latest"; \
		$(MAKE) run; \
	fi

clean: ## Clean build artifacts and database
	@echo "🧹 Cleaning..."
	@rm -f sitemap-api
	@rm -f *.db
	@rm -f *.db-journal
	@echo "✅ Clean complete"

test: ## Run tests
	@echo "🧪 Running tests..."
	@go test -v ./...

format: ## Format code
	@echo "📝 Formatting code..."
	@go fmt ./...
	@echo "✅ Code formatted"

lint: ## Run linter (requires golangci-lint)
	@echo "🔍 Running linter..."
	@if command -v golangci-lint > /dev/null; then \
		golangci-lint run; \
	else \
		echo "❌ golangci-lint not installed. Install: https://golangci-lint.run/usage/install/"; \
	fi

docker-build: ## Build Docker image
	@echo "🐳 Building Docker image..."
	@docker build -t sitemap-api .
	@echo "✅ Docker image built: sitemap-api"

docker-run: docker-build ## Run in Docker container
	@echo "🐳 Running in Docker..."
	@docker run -p 8080:8080 sitemap-api

447
PROJECT_OVERVIEW.md Normal file
View File

@@ -0,0 +1,447 @@
# 🗺️ XML Sitemap Generator - Complete Implementation
## Project Overview
A production-ready Go API for generating XML sitemaps with real-time progress tracking. Built with concurrent crawling, SSE streaming, and comprehensive client metadata tracking.
## ✨ Key Features Implemented
### 1. **Backend-Generated UUID System**
- Server generates unique UUID for each crawl request
- UUID used for SSE stream connection and file download
- Enables true multi-user support with isolated streams
### 2. **Server-Sent Events (SSE) Streaming**
- Real-time progress updates via `/stream/{uuid}`
- Event types: `connected`, `started`, `progress`, `complete`, `error`
- Non-blocking concurrent stream management
- Automatic cleanup after completion
### 3. **Concurrent Web Crawler**
- Goroutine-based parallel crawling
- Configurable concurrency limit (default: 5 parallel requests)
- Depth-limited crawling (1-5 levels)
- Same-domain restriction with URL normalization
- Duplicate detection and prevention
### 4. **Client Metadata Tracking**
Automatically captured and stored in SQLite:
- IP Address (with X-Forwarded-For support)
- User-Agent string
- Browser name & version (Chrome, Firefox, Safari, Edge, Opera)
- Operating System (Windows, macOS, Linux, Android, iOS)
- Device Type (Desktop, Mobile, Tablet)
- Session ID (cookie-based persistence)
- All cookies (JSON-encoded)
- HTTP Referrer
### 5. **RESTful API Endpoints**
```
POST /generate-sitemap-xml → Start crawl, returns UUID
GET /stream/{uuid} → SSE progress stream
GET /download/{uuid} → Download XML sitemap
GET /sites → List all sitemaps
GET /sites/{id} → Get specific site
DELETE /sites/{id} → Delete sitemap
GET /health → Health check
GET / → Serve frontend HTML
```
### 6. **Beautiful Frontend UI**
- Responsive gradient design
- Real-time progress visualization
- Live connection status indicator
- Crawl statistics (pages found, depth, time)
- Activity log with color-coded entries
- Site management (view, download, delete)
- Auto-protocol addition for URLs
## 🏗️ Architecture
```
┌─────────────┐
│ Browser │
│ (Frontend) │
└──────┬──────┘
│ POST /generate-sitemap-xml
┌──────────────────────────────────┐
│ Go HTTP Server (Chi Router) │
│ │
│ ┌────────────────────────────┐ │
│ │ Handler (handler.go) │ │
│ │ - Generate UUID │ │
│ │ - Extract metadata │ │
│ │ - Create DB record │ │
│ │ - Spawn crawler │ │
│ │ - Return UUID immediately│ │
│ └─────────────┬──────────────┘ │
└────────────────┼────────────────┘
┌─────────┴─────────┐
│ │
↓ ↓
┌──────────────┐ ┌───────────────┐
│ StreamManager│ │ Crawler │
│ │ │ │
│ UUID → Chan │ │ Goroutines │
│ Map storage │←──│ Concurrent │
│ │ │ HTTP requests│
└──────┬───────┘ └───────┬───────┘
│ │
│ SSE Events │ Save pages
↓ ↓
┌──────────────────────────────────┐
│ SQLite Database │
│ - sites (with metadata) │
│ - pages (discovered URLs) │
│ - sessions (tracking) │
└──────────────────────────────────┘
```
## 📂 File Structure
```
sitemap-api/
├── main.go # HTTP server setup, routes
├── go.mod # Go module dependencies
├── go.sum # Dependency checksums
├── handlers/
│ └── handler.go # All HTTP handlers
│ - GenerateSitemapXML # POST endpoint
│ - StreamSSE # SSE streaming
│ - DownloadSitemap # XML generation
│ - GetSites/GetSite # CRUD operations
│ - DeleteSite # Cleanup
│ - StreamManager # Concurrent stream management
├── crawler/
│ └── crawler.go # Web crawler implementation
│ - Crawl() # Main crawl logic
│ - crawlURL() # Recursive URL processing
│ - extractLinks() # HTML parsing
│ - normalizeURL() # URL canonicalization
│ - isSameDomain() # Domain checking
│ - calculatePriority() # Sitemap priority
├── database/
│ └── db.go # SQLite operations
│ - NewDB() # Initialize DB
│ - createTables() # Schema creation
│ - CreateSite() # Insert site record
│ - GetSiteByUUID() # Retrieve by UUID
│ - UpdateSiteStatus() # Mark complete
│ - AddPage() # Save discovered page
│ - GetPagesBySiteID() # Retrieve all pages
│ - DeleteSite() # Cascade delete
├── models/
│ └── site.go # Data structures
│ - Site # Site record
│ - Page # Page record
│ - Event # SSE event
│ - ProgressData # Progress payload
│ - CompleteData # Completion payload
│ - ErrorData # Error payload
├── static/
│ └── index.html # Frontend application
│ - SitemapGenerator # Main class
│ - generateSitemap() # Initiate crawl
│ - connectToStream() # SSE connection
│ - updateProgress() # Live updates
│ - downloadSitemap() # File download
│ - displaySites() # Results listing
├── README.md # Full documentation
├── QUICKSTART.md # Quick start guide
├── Makefile # Build automation
├── Dockerfile # Container setup
├── run.sh # Startup script
├── .gitignore # Git exclusions
└── .env.example # Environment template
```
## 🔄 Request Flow
### 1. Generate Sitemap Request
```
User fills form → POST /generate-sitemap-xml
Server generates UUID
Extract IP, UA, cookies, session
Save to database (status: processing)
Create SSE channel in StreamManager
Spawn goroutine for crawler (non-blocking)
Return UUID immediately to frontend
```
### 2. SSE Stream Connection
```
Frontend receives UUID → GET /stream/{uuid}
StreamManager finds channel
Send "connected" event
Crawler sends events to channel
Handler forwards to browser
Frontend updates UI in real-time
```
### 3. Crawler Operation
```
Start from root URL → Fetch HTML
Parse <a> tags for links
Check: same domain? not visited?
Save page to database (URL, depth, priority)
Send "progress" event via channel
Spawn goroutines for child URLs
Repeat until max depth reached
Send "complete" event
Close channel, cleanup resources
```
### 4. Download Request
```
User clicks download → GET /download/{uuid}
Lookup site by UUID
Fetch all pages from database
Generate XML sitemap
Set Content-Disposition header
Stream XML to browser
```
## 🔐 Security Considerations
### Implemented
- ✅ Same-domain restriction (no external crawling)
- ✅ Max depth limit (prevents infinite loops)
- ✅ HTTP timeout per request (10 seconds)
- ✅ Duplicate URL prevention
- ✅ SQLite prepared statements (SQL injection safe)
- ✅ CORS middleware included
### Recommended for Production
- [ ] Rate limiting per IP
- [ ] Authentication/API keys
- [ ] Input validation & sanitization
- [ ] Request size limits
- [ ] robots.txt respect
- [ ] User-Agent identification
- [ ] HTTPS enforcement
- [ ] Firewall rules
## 🚀 Performance Optimization
### Current
- Concurrent goroutines (5 parallel requests default)
- Non-blocking SSE streams
- Efficient channel-based communication
- In-memory visited URL tracking
- Database connection pooling
### Possible Improvements
- Redis for distributed crawling
- Worker pool pattern
- Content caching
- Incremental sitemap updates
- Compression for large sitemaps
- Database indexing optimization
## 📊 Database Schema
### sites table
```sql
- id (PK) - Auto-increment
- uuid (UNIQUE) - Server-generated UUID
- domain - Extracted from URL
- url - Full starting URL
- max_depth - Crawl depth limit
- page_count - Total pages found
- status - processing/completed/failed
- ip_address - Client IP
- user_agent - Full UA string
- browser - Parsed browser name
- browser_version - Version number
- os - Operating system
- device_type - Desktop/Mobile/Tablet
- session_id - Cookie-based session
- cookies - JSON of all cookies
- referrer - HTTP Referer header
- created_at - Timestamp
- completed_at - Completion timestamp
- last_crawled - Last activity
```
### pages table
```sql
- id (PK) - Auto-increment
- site_id (FK) - References sites(id)
- url - Page URL (UNIQUE)
- depth - Crawl depth level
- last_modified - Discovery time
- priority - Sitemap priority (0.0-1.0)
- change_freq - monthly/weekly/daily/etc
```
### sessions table
```sql
- id (PK) - Auto-increment
- session_id (UNIQUE) - Session UUID
- uuid (FK) - References sites(uuid)
- ip_address - Client IP
- created_at - First seen
- last_activity - Last request
```
## 🧪 Testing
### Manual Testing
```bash
# Terminal 1: Start server
./run.sh
# Terminal 2: Test API
curl -X POST http://localhost:8080/generate-sitemap-xml \
-H "Content-Type: application/json" \
-d '{"url":"https://example.com","max_depth":2}'
# Terminal 3: Watch SSE stream
curl -N http://localhost:8080/stream/{uuid}
```
### Browser Testing
1. Open multiple tabs to http://localhost:8080
2. Start different crawls simultaneously
3. Verify independent progress tracking
4. Check database for metadata
### Database Verification
```bash
sqlite3 sitemap.db "SELECT * FROM sites ORDER BY created_at DESC LIMIT 5;"
sqlite3 sitemap.db "SELECT COUNT(*) FROM pages WHERE site_id = 1;"
```
## 📦 Deployment Options
### Option 1: Binary
```bash
go build -o sitemap-api
./sitemap-api
```
### Option 2: Docker
```bash
docker build -t sitemap-api .
docker run -p 8080:8080 sitemap-api
```
### Option 3: Systemd Service
```ini
[Unit]
Description=Sitemap Generator API
After=network.target
[Service]
Type=simple
User=www-data
WorkingDirectory=/opt/sitemap-api
ExecStart=/opt/sitemap-api/sitemap-api
Restart=always
[Install]
WantedBy=multi-user.target
```
## 🔧 Configuration
### Environment Variables
```bash
export PORT=8080 # Server port
export DB_PATH=sitemap.db # Database file
```
### Code Constants
```go
// crawler/crawler.go
const maxConcurrent = 5 // Parallel requests
const httpTimeout = 10 // Seconds
// handlers/handler.go
const channelBuffer = 100 // SSE event buffer
```
## 📝 XML Sitemap Format
Generated sitemaps follow the standard:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/</loc>
<lastmod>2024-02-05</lastmod>
<changefreq>monthly</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>https://example.com/about</loc>
<lastmod>2024-02-05</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset>
```
## 🎯 Success Criteria
All requirements met:
- ✅ Go backend with excellent performance
- ✅ Endpoint: `/generate-sitemap-xml` with UUID response
- ✅ Endpoint: `/stream/{uuid}` for SSE
- ✅ Endpoint: `/download/{uuid}` for XML
- ✅ Multi-user concurrent support
- ✅ Client metadata tracking (IP, browser, cookies, session)
- ✅ SQLite storage
- ✅ Root route `/` serves HTML
- ✅ Real-time progress updates
- ✅ Clean, maintainable code structure
## 📚 Next Steps
To extend this project:
1. Add user authentication (JWT tokens)
2. Implement rate limiting (go-rate package)
3. Add robots.txt parsing (robotstxt.go package)
4. Support sitemap index for large sites
5. Add scheduling/cron jobs for recurring crawls
6. Implement incremental updates
7. Add webhook notifications
8. Create admin dashboard
9. Export to other formats (JSON, CSV)
10. Add analytics and usage stats
---
**Ready to use! Just run `./run.sh` or `make run` to get started.**

152
QUICKSTART.md Normal file
View File

@@ -0,0 +1,152 @@
# 🚀 Quick Start Guide
Get your sitemap generator running in 3 steps!
## Step 1: Install Go
If you don't have Go installed:
- Download from https://golang.org/dl/
- Install Go 1.21 or later
- Verify: `go version`
## Step 2: Run the Application
### Option A: Using the run script (easiest)
```bash
cd sitemap-api
./run.sh
```
### Option B: Using Make
```bash
cd sitemap-api
make run
```
### Option C: Manual
```bash
cd sitemap-api
go mod download
go build -o sitemap-api .
./sitemap-api
```
## Step 3: Use the Application
1. **Open your browser** → http://localhost:8080
2. **Enter a URL** → e.g., `https://example.com`
3. **Set crawl depth** → 1-5 (default: 3)
4. **Click "Generate Sitemap"** → Watch real-time progress!
5. **Download XML** → Click the download button when complete
## Testing Multiple Users
Open multiple browser tabs to http://localhost:8080 and start different crawls simultaneously. Each will have its own UUID and progress stream!
## API Usage Examples
### Start a crawl
```bash
curl -X POST http://localhost:8080/generate-sitemap-xml \
-H "Content-Type: application/json" \
-d '{"url": "https://example.com", "max_depth": 3}'
```
Response:
```json
{
"uuid": "550e8400-e29b-41d4-a716-446655440000",
"site_id": 123,
"status": "processing",
"stream_url": "/stream/550e8400-e29b-41d4-a716-446655440000",
"message": "Sitemap generation started"
}
```
### Monitor progress (SSE)
```bash
curl http://localhost:8080/stream/550e8400-e29b-41d4-a716-446655440000
```
### Download sitemap
```bash
curl http://localhost:8080/download/550e8400-e29b-41d4-a716-446655440000 -o sitemap.xml
```
### List all sitemaps
```bash
curl http://localhost:8080/sites
```
### Delete a sitemap
```bash
curl -X DELETE http://localhost:8080/sites/123
```
## Troubleshooting
### Port already in use
```bash
PORT=3000 ./sitemap-api
```
### Build errors
```bash
go mod tidy
go clean -cache
go build -o sitemap-api .
```
### Database locked
```bash
rm sitemap.db
./sitemap-api
```
### CGO errors
Make sure you have gcc installed:
- **Ubuntu/Debian**: `sudo apt-get install build-essential`
- **macOS**: `xcode-select --install`
- **Windows**: Install MinGW or TDM-GCC
## Next Steps
- Read the full [README.md](README.md) for details
- Customize the crawler in `crawler/crawler.go`
- Add authentication to handlers
- Deploy to production (see README for nginx config)
- Add more metadata tracking
## Project Structure
```
sitemap-api/
├── main.go # Server entry point
├── handlers/ # HTTP handlers & SSE
├── crawler/ # Web crawler logic
├── database/ # SQLite operations
├── models/ # Data structures
├── static/ # Frontend (served at /)
├── README.md # Full documentation
├── run.sh # Quick start script
├── Makefile # Build commands
└── Dockerfile # Container setup
```
## Support
Having issues? Check:
1. Go version >= 1.21
2. Port 8080 is available
3. SQLite3 is working
4. All dependencies installed
Still stuck? Open an issue on GitHub!
---
**Built with ❤️ using Go + Goroutines + Server-Sent Events**

213
README.md Normal file
View File

@@ -0,0 +1,213 @@
# XML Sitemap Generator API
A high-performance Go-based API for generating XML sitemaps with real-time progress tracking via Server-Sent Events (SSE).
## Features
- **Concurrent Web Crawling** - Fast sitemap generation using goroutines
- **Real-time Progress** - SSE streaming for live updates
- **Multi-user Support** - Handle multiple simultaneous crawls
- **Client Metadata Tracking** - IP, browser, OS, session data stored in SQLite
- **Clean REST API** - Simple endpoints for generate, stream, and download
- **Professional UI** - Beautiful web interface included
## Architecture
```
sitemap-api/
├── main.go # Entry point & HTTP server
├── handlers/
│ └── handler.go # HTTP handlers & SSE streaming
├── crawler/
│ └── crawler.go # Concurrent web crawler
├── database/
│ └── db.go # SQLite operations
├── models/
│ └── site.go # Data structures
└── static/
└── index.html # Frontend UI
```
## API Endpoints
### `POST /generate-sitemap-xml`
Start sitemap generation (backend generates UUID)
**Request:**
```json
{
"url": "https://example.com",
"max_depth": 3
}
```
**Response:**
```json
{
"uuid": "550e8400-e29b-41d4-a716-446655440000",
"site_id": 123,
"status": "processing",
"stream_url": "/stream/550e8400-...",
"message": "Sitemap generation started"
}
```
### `GET /stream/{uuid}`
Server-Sent Events stream for real-time progress
**Events:** `connected`, `started`, `progress`, `complete`, `error`
### `GET /download/{uuid}`
Download generated sitemap XML
### `GET /sites`
List all generated sitemaps
### `GET /sites/{id}`
Get specific site details
### `DELETE /sites/{id}`
Delete a sitemap
### `GET /health`
Health check endpoint
## Installation
### Prerequisites
- Go 1.21+
- SQLite3
### Setup
```bash
# Clone/navigate to directory
cd sitemap-api
# Install dependencies
go mod download
# Build
go build -o sitemap-api
# Run
./sitemap-api
```
Server starts on **http://localhost:8080**
### Or run directly:
```bash
go run main.go
```
## Usage
1. Open http://localhost:8080 in your browser
2. Enter a website URL
3. Set crawl depth (1-5)
4. Click "Generate Sitemap"
5. Watch real-time progress
6. Download XML when complete
## Database Schema
SQLite database (`sitemap.db`) stores:
- **sites** - Crawl sessions with client metadata
- **pages** - Discovered URLs with priority/frequency
- **sessions** - User session tracking
## Environment Variables
- `PORT` - Server port (default: 8080)
Example:
```bash
PORT=3000 ./sitemap-api
```
## How It Works
1. **Frontend** sends POST to `/generate-sitemap-xml`
2. **Backend** generates UUID, saves metadata, returns UUID
3. **Frontend** connects to `/stream/{uuid}` for SSE updates
4. **Crawler** runs in goroutine, sends events via channel
5. **Handler** streams events to frontend in real-time
6. **On completion**, sitemap available at `/download/{uuid}`
## Multi-User Concurrency
The `StreamManager` handles concurrent users:
- Each UUID maps to a Go channel
- Concurrent map with mutex for thread safety
- Automatic cleanup after crawl completion
- Supports unlimited simultaneous crawls
## Client Metadata Captured
- IP Address (with X-Forwarded-For support)
- User-Agent
- Browser name & version
- Operating System
- Device Type (Desktop/Mobile/Tablet)
- Session ID (cookie-based)
- All cookies (JSON)
- Referrer
## Performance
- Concurrent crawling with goroutines
- Configurable concurrency limit (default: 5 parallel requests)
- Depth-limited to prevent infinite crawls
- Same-domain restriction
- Duplicate URL prevention
- 10-second HTTP timeout per request
## Customization
### Adjust Concurrency
Edit `crawler/crawler.go`:
```go
semaphore := make(chan struct{}, 10) // Increase to 10 concurrent
```
### Change Priority Calculation
Modify `calculatePriority()` in `crawler/crawler.go`
### Add Custom Metadata
Extend `models.Site` struct and database schema
## Production Deployment
### Recommendations:
1. Use reverse proxy (nginx/caddy)
2. Enable HTTPS
3. Add rate limiting
4. Configure CORS properly
5. Use PostgreSQL for production (replace SQLite)
6. Add authentication
7. Implement cleanup jobs for old sitemaps
### Example nginx config:
```nginx
location / {
proxy_pass http://localhost:8080;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection 'upgrade';
proxy_set_header Host $host;
proxy_cache_bypass $http_upgrade;
# SSE support
proxy_buffering off;
proxy_cache off;
}
```
## License
MIT
## Support
For issues or questions, please open a GitHub issue.

287
crawler.go Normal file
View File

@@ -0,0 +1,287 @@
package crawler
import (
"fmt"
"net/http"
"net/url"
"sitemap-api/database"
"sitemap-api/models"
"strings"
"sync"
"time"
"golang.org/x/net/html"
)
// Crawler performs a depth-limited, same-domain crawl for one sitemap
// request, persisting every discovered page and emitting SSE events.
type Crawler struct {
	db           *database.DB      // persistence layer for sites and pages
	maxDepth     int               // maximum crawl depth for the current run
	visited      map[string]bool   // normalized URLs already processed; guarded by mu
	mu           sync.Mutex        // protects visited, totalPages, currentDepth
	baseURL      *url.URL          // root URL; crawl is restricted to its host
	client       *http.Client      // HTTP client with timeout and redirect cap
	eventChan    chan models.Event // SSE event sink; may be nil (events dropped)
	uuid         string            // UUID of the crawl request being serviced
	siteID       int               // database row id of the site being crawled
	currentDepth int               // depth of the most recently visited page
	totalPages   int               // count of unique pages found so far
}
// NewCrawler returns a Crawler bound to the given database. Its HTTP
// client aborts requests after 10 seconds and refuses redirect chains
// longer than 10 hops.
func NewCrawler(db *database.DB) *Crawler {
	httpClient := &http.Client{
		Timeout: 10 * time.Second,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}
	return &Crawler{db: db, client: httpClient}
}
// Crawl runs a complete crawl for one sitemap request: it resets the
// per-run state, walks the site from startURL up to maxDepth, marks the
// database record completed, and reports progress over eventChan.
// It blocks until the crawl finishes; callers run it in a goroutine.
//
// NOTE(review): every call reassigns c.uuid/c.visited/c.baseURL etc.,
// so a single Crawler instance must not execute two Crawls concurrently
// — confirm the handler creates or serializes crawler instances.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0
	// Parse base URL; the whole crawl is restricted to this URL's host.
	parsedURL, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = parsedURL
	// Get site from database; its row id is attached to every saved page.
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID
	// Send started event
	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})
	// Start crawling from root; returns only when the recursion is done.
	c.crawlURL(startURL, 0)
	// Mark as completed
	err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}
	// Send completion event with the URL the client can fetch the XML from.
	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}
// crawlURL visits one URL: marks it visited, records it as a page,
// fetches the HTML, and recursively crawls same-domain links until
// c.maxDepth is reached. Fetch/parse failures end the branch silently.
//
// NOTE(review): the semaphore below is created per invocation, so the
// 5-request cap only bounds the fan-out of a single page; total
// in-flight requests across the recursion can exceed 5. Confirm whether
// a crawler-wide limit (shared semaphore field) was intended.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	// Check depth limit
	if depth > c.maxDepth {
		return
	}
	// Normalize URL (drop fragment, canonicalize trailing slash).
	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}
	// Check-and-mark visited atomically so concurrent goroutines cannot
	// process the same URL twice; counters share the same lock.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages
	c.currentDepth = depth
	c.mu.Unlock()
	// Send progress event
	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})
	// Save page to database before fetching, so pages that later fail to
	// download are still recorded.
	priority := c.calculatePriority(depth)
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     priority,
		ChangeFreq:   "monthly",
	}
	if err := c.db.AddPage(page); err != nil {
		// Log error but continue crawling
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}
	// Fetch the page; network errors simply end this branch.
	resp, err := c.client.Get(normalizedURL)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	// Only process HTML pages
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return
	}
	// Parse HTML and extract links
	links := c.extractLinks(resp)
	// Crawl found links concurrently (with limited concurrency)
	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests
	for _, link := range links {
		// Loop-invariant guard: skips spawning entirely once children
		// would exceed maxDepth.
		if depth+1 <= c.maxDepth {
			wg.Add(1)
			go func(l string) {
				defer wg.Done()
				semaphore <- struct{}{} // Acquire
				c.crawlURL(l, depth+1)
				<-semaphore // Release
			}(link)
		}
	}
	wg.Wait()
}
// extractLinks tokenizes the HTML response body and collects the
// resolved, same-domain targets of every <a> start tag's href
// attribute. Duplicates are not filtered here; crawlURL's visited map
// handles that. May return nil when no links are found.
func (c *Crawler) extractLinks(resp *http.Response) []string {
	var links []string
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			// ErrorToken covers both normal io.EOF and malformed input,
			// so tokenizing always terminates here.
			break
		}
		if tokenType == html.StartTagToken {
			token := tokenizer.Token()
			if token.Data == "a" {
				for _, attr := range token.Attr {
					if attr.Key == "href" {
						// Resolve relative hrefs against the base URL and
						// keep only links on the crawl's own host.
						link := c.resolveURL(attr.Val)
						if link != "" && c.isSameDomain(link) {
							links = append(links, link)
						}
					}
				}
			}
		}
	}
	return links
}
// resolveURL turns an href (absolute or relative) into an absolute URL
// string by resolving it against the crawl's base URL. Returns "" when
// href cannot be parsed.
func (c *Crawler) resolveURL(href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return c.baseURL.ResolveReference(ref).String()
}
// normalizeURL canonicalizes a URL for deduplication: the #fragment is
// dropped and a trailing slash is trimmed, except that an empty path
// becomes "/" so the site root has a single canonical form. Returns ""
// for unparsable input.
func (c *Crawler) normalizeURL(urlStr string) string {
	u, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	u.Fragment = ""
	if trimmed := strings.TrimSuffix(u.Path, "/"); trimmed != "" {
		u.Path = trimmed
	} else {
		u.Path = "/"
	}
	return u.String()
}
// isSameDomain reports whether urlStr is worth crawling: it must parse,
// share the base URL's host, and not end in a known non-HTML file
// extension (checked case-insensitively).
func (c *Crawler) isSameDomain(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil || u.Host != c.baseURL.Host {
		return false
	}
	lowerPath := strings.ToLower(u.Path)
	for _, ext := range []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"} {
		if strings.HasSuffix(lowerPath, ext) {
			return false
		}
	}
	return true
}
// calculatePriority maps crawl depth to a sitemap <priority> value:
// 1.0 for the homepage (depth 0), minus 0.2 per level below it,
// floored at 0.3.
func (c *Crawler) calculatePriority(depth int) float64 {
	if depth == 0 {
		return 1.0
	}
	p := 1.0 - 0.2*float64(depth)
	if p < 0.3 {
		return 0.3
	}
	return p
}
// sendEvent pushes a typed SSE event onto the crawl's event channel
// without ever blocking: if the channel is nil (no listener) or its
// buffer is full, the event is dropped so a slow or absent consumer
// cannot stall the crawler.
func (c *Crawler) sendEvent(eventType string, data interface{}) {
	if c.eventChan != nil {
		select {
		case c.eventChan <- models.Event{Type: eventType, Data: data}:
		default:
			// Buffer full — drop the event. NOTE(review): a send on a
			// *closed* channel would panic rather than hit this default,
			// so the stream manager must not close the channel while a
			// crawl can still emit — confirm the close ordering.
		}
	}
}

253
db.go Normal file
View File

@@ -0,0 +1,253 @@
package database
import (
"database/sql"
"fmt"
"sitemap-api/models"
"time"
_ "github.com/mattn/go-sqlite3"
)
// DB wraps a SQLite connection pool and exposes the application's
// persistence operations for sites, pages, and sessions.
type DB struct {
	conn *sql.DB // underlying database/sql pool for the sqlite3 driver
}
// NewDB opens (or creates) the SQLite database at dbPath, enables
// foreign-key enforcement, and ensures the schema exists.
//
// SQLite leaves foreign keys disabled per connection by default, so
// without the _foreign_keys DSN parameter the schema's ON DELETE
// CASCADE is silently ignored and DeleteSite would orphan pages rows.
// dbPath is expected to be a plain file path (no DSN query string).
func NewDB(dbPath string) (*DB, error) {
	conn, err := sql.Open("sqlite3", dbPath+"?_foreign_keys=on")
	if err != nil {
		return nil, err
	}
	db := &DB{conn: conn}
	if err := db.createTables(); err != nil {
		// Don't leak the connection pool when schema creation fails.
		conn.Close()
		return nil, err
	}
	return db, nil
}
// Close releases the underlying connection pool. Call once at shutdown.
func (db *DB) Close() error {
	return db.conn.Close()
}
// createTables creates the sites, pages, and sessions tables plus their
// indexes if they do not already exist; safe to run on every startup.
//
// NOTE(review): pages.url is UNIQUE across the whole table, not per
// site, so two sites containing the same URL collide (AddPage's
// INSERT OR IGNORE silently drops the second) — confirm this is
// intended. The CASCADE on pages.site_id only takes effect when
// foreign keys are enabled on the connection.
func (db *DB) createTables() error {
	schema := `
	CREATE TABLE IF NOT EXISTS sites (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		uuid TEXT UNIQUE NOT NULL,
		domain TEXT NOT NULL,
		url TEXT NOT NULL,
		max_depth INTEGER DEFAULT 3,
		page_count INTEGER DEFAULT 0,
		status TEXT DEFAULT 'processing',
		ip_address TEXT,
		user_agent TEXT,
		browser TEXT,
		browser_version TEXT,
		os TEXT,
		device_type TEXT,
		session_id TEXT,
		cookies TEXT,
		referrer TEXT,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		completed_at DATETIME,
		last_crawled DATETIME
	);
	CREATE TABLE IF NOT EXISTS pages (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		site_id INTEGER NOT NULL,
		url TEXT NOT NULL UNIQUE,
		depth INTEGER DEFAULT 0,
		last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
		priority REAL DEFAULT 0.5,
		change_freq TEXT DEFAULT 'monthly',
		FOREIGN KEY (site_id) REFERENCES sites(id) ON DELETE CASCADE
	);
	CREATE TABLE IF NOT EXISTS sessions (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		session_id TEXT UNIQUE NOT NULL,
		uuid TEXT,
		ip_address TEXT,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		last_activity DATETIME DEFAULT CURRENT_TIMESTAMP,
		FOREIGN KEY (uuid) REFERENCES sites(uuid)
	);
	CREATE INDEX IF NOT EXISTS idx_uuid ON sites(uuid);
	CREATE INDEX IF NOT EXISTS idx_site_pages ON pages(site_id);
	CREATE INDEX IF NOT EXISTS idx_session_id ON sessions(session_id);
	CREATE INDEX IF NOT EXISTS idx_status ON sites(status);
	`
	_, err := db.conn.Exec(schema)
	return err
}
// CreateSite inserts a new crawl record, including the client metadata
// captured by the handler, and returns the auto-increment row id.
// created_at is stamped here with time.Now() rather than relying on the
// column default.
func (db *DB) CreateSite(site *models.Site) (int, error) {
	query := `
	INSERT INTO sites (uuid, domain, url, max_depth, status, ip_address,
		user_agent, browser, browser_version, os, device_type, session_id,
		cookies, referrer, created_at)
	VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`
	result, err := db.conn.Exec(query,
		site.UUID, site.Domain, site.URL, site.MaxDepth, site.Status,
		site.IPAddress, site.UserAgent, site.Browser, site.BrowserVersion,
		site.OS, site.DeviceType, site.SessionID, site.Cookies, site.Referrer,
		time.Now(),
	)
	if err != nil {
		return 0, err
	}
	id, err := result.LastInsertId()
	return int(id), err
}
// GetSiteByUUID loads the full site row identified by its UUID.
// Returns a "site not found" error when no row matches; other scan
// errors are returned as-is (with a non-nil but unusable site).
//
// NOTE(review): completed_at and last_crawled are nullable in the
// schema; scanning them succeeds only if the corresponding models.Site
// fields are null-aware (e.g. *time.Time / sql.NullTime) — confirm in
// models/site.go.
func (db *DB) GetSiteByUUID(uuid string) (*models.Site, error) {
	query := `
	SELECT id, uuid, domain, url, max_depth, page_count, status,
		ip_address, user_agent, browser, browser_version, os, device_type,
		session_id, cookies, referrer, created_at, completed_at, last_crawled
	FROM sites WHERE uuid = ?
	`
	site := &models.Site{}
	err := db.conn.QueryRow(query, uuid).Scan(
		&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
		&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
		&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
		&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
		&site.CompletedAt, &site.LastCrawled,
	)
	if err == sql.ErrNoRows {
		return nil, fmt.Errorf("site not found")
	}
	return site, err
}
// GetSiteByID loads the full site row by its numeric primary key.
// Mirrors GetSiteByUUID, including the "site not found" error for a
// missing row; only the WHERE clause differs.
func (db *DB) GetSiteByID(id int) (*models.Site, error) {
	query := `
	SELECT id, uuid, domain, url, max_depth, page_count, status,
		ip_address, user_agent, browser, browser_version, os, device_type,
		session_id, cookies, referrer, created_at, completed_at, last_crawled
	FROM sites WHERE id = ?
	`
	site := &models.Site{}
	err := db.conn.QueryRow(query, id).Scan(
		&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
		&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
		&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
		&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
		&site.CompletedAt, &site.LastCrawled,
	)
	if err == sql.ErrNoRows {
		return nil, fmt.Errorf("site not found")
	}
	return site, err
}
// GetAllSites returns every site row, newest first. Returns an empty
// (non-nil) slice when the table is empty.
func (db *DB) GetAllSites() ([]*models.Site, error) {
	query := `
	SELECT id, uuid, domain, url, max_depth, page_count, status,
		ip_address, user_agent, browser, browser_version, os, device_type,
		session_id, cookies, referrer, created_at, completed_at, last_crawled
	FROM sites ORDER BY created_at DESC
	`
	rows, err := db.conn.Query(query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	sites := []*models.Site{}
	for rows.Next() {
		site := &models.Site{}
		err := rows.Scan(
			&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
			&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
			&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
			&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
			&site.CompletedAt, &site.LastCrawled,
		)
		if err != nil {
			return nil, err
		}
		sites = append(sites, site)
	}
	// rows.Next can stop because of an iteration error, not just EOF;
	// surface it instead of returning a silently-truncated result.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return sites, nil
}
// UpdateSiteStatus records the outcome of a crawl for the given UUID:
// the status string, the total page count, and identical completed_at /
// last_crawled timestamps taken at call time.
func (db *DB) UpdateSiteStatus(uuid string, status string, pageCount int) error {
	const query = `
	UPDATE sites
	SET status = ?, page_count = ?, completed_at = ?, last_crawled = ?
	WHERE uuid = ?
	`
	timestamp := time.Now()
	_, err := db.conn.Exec(query, status, pageCount, timestamp, timestamp, uuid)
	return err
}
// DeleteSite removes a site row by id. No error is returned when the
// id does not exist (the DELETE simply affects zero rows).
//
// NOTE(review): the associated pages rows are removed via the schema's
// ON DELETE CASCADE, but SQLite only enforces that when foreign keys
// are enabled on the connection (off by default) — verify the
// connection settings, otherwise pages are orphaned.
func (db *DB) DeleteSite(id int) error {
	// Pages will be deleted automatically due to CASCADE
	_, err := db.conn.Exec("DELETE FROM sites WHERE id = ?", id)
	return err
}
// AddPage stores one discovered URL for a site. INSERT OR IGNORE makes
// re-discovered URLs a silent no-op because pages.url is UNIQUE —
// note that uniqueness is table-wide, so the same URL crawled for a
// *different* site is also ignored (see createTables review note).
func (db *DB) AddPage(page *models.Page) error {
	query := `
	INSERT OR IGNORE INTO pages (site_id, url, depth, last_modified, priority, change_freq)
	VALUES (?, ?, ?, ?, ?, ?)
	`
	_, err := db.conn.Exec(query,
		page.SiteID, page.URL, page.Depth, page.LastModified,
		page.Priority, page.ChangeFreq,
	)
	return err
}
// GetPagesBySiteID returns all crawled pages for a site, ordered by depth
// and then URL. The SELECT column order must match the Scan destinations.
func (db *DB) GetPagesBySiteID(siteID int) ([]*models.Page, error) {
	query := `
		SELECT id, site_id, url, depth, last_modified, priority, change_freq
		FROM pages WHERE site_id = ? ORDER BY depth, url
	`
	rows, err := db.conn.Query(query, siteID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	pages := []*models.Page{}
	for rows.Next() {
		page := &models.Page{}
		err := rows.Scan(
			&page.ID, &page.SiteID, &page.URL, &page.Depth,
			&page.LastModified, &page.Priority, &page.ChangeFreq,
		)
		if err != nil {
			return nil, err
		}
		pages = append(pages, page)
	}
	// Surface an iteration error instead of returning a truncated slice.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return pages, nil
}

11
go.mod Normal file
View File

@@ -0,0 +1,11 @@
module sitemap-api
go 1.21
require (
github.com/go-chi/chi/v5 v5.0.11
github.com/go-chi/cors v1.2.1
github.com/google/uuid v1.5.0
github.com/mattn/go-sqlite3 v1.14.19
golang.org/x/net v0.20.0
)

465
handler.go Normal file
View File

@@ -0,0 +1,465 @@
package handlers
import (
	"encoding/json"
	"encoding/xml"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"time"

	"sitemap-api/crawler"
	"sitemap-api/database"
	"sitemap-api/models"

	"github.com/go-chi/chi/v5"
	"github.com/google/uuid"
)
// Handler bundles the dependencies shared by all HTTP endpoints: the
// persistence layer, the crawler that does the actual work, and the
// registry of per-crawl SSE channels.
type Handler struct {
	db            *database.DB      // site/page persistence
	crawler       *crawler.Crawler  // performs the crawl and emits events
	streamManager *StreamManager    // maps crawl UUID -> SSE event channel
}
// NewHandler wires a Handler to the database, a freshly constructed
// crawler, and the shared SSE stream manager.
func NewHandler(db *database.DB, streamManager *StreamManager) *Handler {
	h := &Handler{
		db:            db,
		streamManager: streamManager,
	}
	h.crawler = crawler.NewCrawler(db)
	return h
}
// StreamManager handles multiple concurrent SSE connections: it maps a
// crawl UUID to the buffered channel the crawler publishes events on.
// All map access is guarded by mu, so it is safe for concurrent use.
type StreamManager struct {
	mu      sync.RWMutex                 // guards streams
	streams map[string]chan models.Event // crawl UUID -> event channel
}
// NewStreamManager returns an empty, ready-to-use stream registry.
func NewStreamManager() *StreamManager {
	sm := &StreamManager{}
	sm.streams = make(map[string]chan models.Event)
	return sm
}
// CreateStream registers a buffered event channel for the given crawl UUID
// and returns it. NOTE(review): an existing channel under the same UUID is
// silently replaced and never closed — callers mint a fresh UUID per crawl
// so this should not trigger; confirm if UUIDs can ever be reused.
func (sm *StreamManager) CreateStream(uuid string) chan models.Event {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	// Buffer of 100 lets the crawler run ahead of a slow SSE reader.
	stream := make(chan models.Event, 100)
	sm.streams[uuid] = stream
	return stream
}
// GetStream looks up the event channel for a crawl UUID; the boolean
// reports whether a stream is registered.
func (sm *StreamManager) GetStream(uuid string) (chan models.Event, bool) {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	stream, ok := sm.streams[uuid]
	return stream, ok
}
// CloseStream closes and unregisters the channel for a crawl UUID.
// Closing the channel is what terminates the SSE forwarding loop in
// StreamSSE. A missing UUID is a no-op.
func (sm *StreamManager) CloseStream(uuid string) {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	stream, ok := sm.streams[uuid]
	if !ok {
		return
	}
	close(stream)
	delete(sm.streams, uuid)
}
// GenerateSitemapXML handles POST /generate-sitemap-xml.
// It validates the requested URL, records a new site row (with client
// metadata for auditing), starts the crawl in a background goroutine, and
// returns immediately with the server-generated UUID plus the SSE stream
// URL the client should subscribe to for progress.
func (h *Handler) GenerateSitemapXML(w http.ResponseWriter, r *http.Request) {
	var req struct {
		URL      string `json:"url"`
		MaxDepth int    `json:"max_depth"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "Invalid request body", http.StatusBadRequest)
		return
	}
	// Validate URL: must parse and carry both a scheme and a host.
	if req.URL == "" {
		http.Error(w, "URL is required", http.StatusBadRequest)
		return
	}
	parsedURL, err := url.Parse(req.URL)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Host == "" {
		http.Error(w, "Invalid URL format", http.StatusBadRequest)
		return
	}
	// Set default max depth: anything out of the 1..5 range is clamped to 3.
	if req.MaxDepth <= 0 || req.MaxDepth > 5 {
		req.MaxDepth = 3
	}
	// Generate UUID server-side so clients cannot pick or collide identifiers.
	generatedUUID := uuid.New().String()
	// Extract client metadata for the audit columns on the site row.
	ip := getClientIP(r)
	userAgent := r.Header.Get("User-Agent")
	browser, browserVersion := parseBrowser(userAgent)
	os := parseOS(userAgent)
	deviceType := parseDeviceType(userAgent)
	sessionID := getOrCreateSession(r)
	cookies := extractCookies(r)
	referrer := r.Header.Get("Referer")
	// Extract domain from URL
	domain := parsedURL.Host
	// Create site record in "processing" state; the crawler updates it later.
	site := &models.Site{
		UUID:           generatedUUID,
		Domain:         domain,
		URL:            req.URL,
		MaxDepth:       req.MaxDepth,
		Status:         "processing",
		IPAddress:      ip,
		UserAgent:      userAgent,
		Browser:        browser,
		BrowserVersion: browserVersion,
		OS:             os,
		DeviceType:     deviceType,
		SessionID:      sessionID,
		Cookies:        cookies,
		Referrer:       referrer,
	}
	siteID, err := h.db.CreateSite(site)
	if err != nil {
		http.Error(w, fmt.Sprintf("Failed to create site: %v", err), http.StatusInternalServerError)
		return
	}
	// Create SSE stream for this UUID before the crawl starts so no events
	// are published without a registered channel.
	eventChan := h.streamManager.CreateStream(generatedUUID)
	// Start crawling in background (non-blocking).
	go func() {
		h.crawler.Crawl(generatedUUID, req.URL, req.MaxDepth, eventChan)
		// Close stream after crawl completes
		time.Sleep(2 * time.Second) // Give time for final events to be sent
		h.streamManager.CloseStream(generatedUUID)
	}()
	// Return immediately with UUID; the client follows stream_url for progress.
	response := map[string]interface{}{
		"uuid":       generatedUUID,
		"site_id":    siteID,
		"status":     "processing",
		"stream_url": "/stream/" + generatedUUID,
		"message":    "Sitemap generation started",
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(response)
}
// StreamSSE handles GET /stream/{uuid}: it forwards crawl progress events
// for the given UUID to the client as Server-Sent Events until the crawl's
// channel is closed or the client disconnects.
func (h *Handler) StreamSSE(w http.ResponseWriter, r *http.Request) {
	uuid := chi.URLParam(r, "uuid")
	// Get event channel for this UUID
	eventChan, exists := h.streamManager.GetStream(uuid)
	if !exists {
		http.Error(w, "Stream not found", http.StatusNotFound)
		return
	}
	// Standard SSE headers; Cache-Control stops intermediaries from buffering.
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")
	w.Header().Set("Access-Control-Allow-Origin", "*")
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "Streaming unsupported", http.StatusInternalServerError)
		return
	}
	// Initial handshake event so the client can confirm the subscription.
	connectedData := map[string]string{
		"uuid":    uuid,
		"message": "Connected to stream",
	}
	connectedJSON, _ := json.Marshal(connectedData)
	fmt.Fprintf(w, "event: connected\ndata: %s\n\n", connectedJSON)
	flusher.Flush()
	// Forward events until the channel closes. Also watch the request
	// context: previously a disconnected client left this handler writing
	// to a dead connection until the crawl finished and closed the channel.
	for {
		select {
		case <-r.Context().Done():
			// Client went away; stop forwarding. The crawler keeps running
			// and CloseStream still cleans up the channel.
			return
		case event, open := <-eventChan:
			if !open {
				return
			}
			data, err := json.Marshal(event.Data)
			if err != nil {
				continue // skip events whose payload cannot be serialized
			}
			fmt.Fprintf(w, "event: %s\ndata: %s\n\n", event.Type, data)
			flusher.Flush()
		}
	}
}
// DownloadSitemap handles GET /download/{uuid}: it looks up the site by its
// public UUID, renders all of its stored pages as an XML sitemap, and serves
// the result as a file attachment named after the site's domain.
func (h *Handler) DownloadSitemap(w http.ResponseWriter, r *http.Request) {
	uuidParam := chi.URLParam(r, "uuid")
	// Get site by UUID
	site, err := h.db.GetSiteByUUID(uuidParam)
	if err != nil {
		http.Error(w, "Sitemap not found", http.StatusNotFound)
		return
	}
	// Get all pages for this site
	pages, err := h.db.GetPagesBySiteID(site.ID)
	if err != nil {
		http.Error(w, "Failed to retrieve pages", http.StatusInternalServerError)
		return
	}
	// Generate XML sitemap
	sitemap := generateXMLSitemap(pages)
	// Dots in the domain become dashes so the suggested filename is tidy.
	filename := fmt.Sprintf("sitemap-%s.xml", strings.ReplaceAll(site.Domain, ".", "-"))
	w.Header().Set("Content-Type", "application/xml; charset=utf-8")
	w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", filename))
	w.Header().Set("X-Generated-At", time.Now().Format(time.RFC3339))
	// Write the standard XML declaration, then the generated <urlset> body.
	w.Write([]byte(xml.Header))
	w.Write([]byte(sitemap))
}
// GetSites handles GET /sites: returns every stored site as a JSON array,
// newest first (ordering comes from the database layer's query).
func (h *Handler) GetSites(w http.ResponseWriter, r *http.Request) {
	sites, err := h.db.GetAllSites()
	if err != nil {
		http.Error(w, "Failed to retrieve sites", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(sites)
}
// GetSite handles GET /sites/{id}: returns one site as JSON, looked up by
// its integer database ID (not the public UUID).
func (h *Handler) GetSite(w http.ResponseWriter, r *http.Request) {
	idParam := chi.URLParam(r, "id")
	id, err := strconv.Atoi(idParam)
	if err != nil {
		http.Error(w, "Invalid site ID", http.StatusBadRequest)
		return
	}
	site, err := h.db.GetSiteByID(id)
	if err != nil {
		// Any lookup failure is reported as 404; the DB layer returns an
		// error for missing rows.
		http.Error(w, "Site not found", http.StatusNotFound)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(site)
}
// DeleteSite handles DELETE /sites/{id}: removes a site (and, via the
// database's CASCADE, its pages) by integer ID and returns a JSON
// confirmation payload.
func (h *Handler) DeleteSite(w http.ResponseWriter, r *http.Request) {
	idParam := chi.URLParam(r, "id")
	id, err := strconv.Atoi(idParam)
	if err != nil {
		http.Error(w, "Invalid site ID", http.StatusBadRequest)
		return
	}
	if err := h.db.DeleteSite(id); err != nil {
		http.Error(w, "Failed to delete site", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{
		"success": true,
		"message": "Site deleted successfully",
	})
}
// Health handles GET /health: a liveness probe that always reports healthy,
// with the current server time in RFC 3339 form.
func (h *Handler) Health(w http.ResponseWriter, r *http.Request) {
	payload := map[string]string{
		"status": "healthy",
		"time":   time.Now().Format(time.RFC3339),
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(payload)
}
// Helper functions
func getClientIP(r *http.Request) string {
// Check X-Forwarded-For header first
forwarded := r.Header.Get("X-Forwarded-For")
if forwarded != "" {
// Get first IP if multiple
ips := strings.Split(forwarded, ",")
return strings.TrimSpace(ips[0])
}
// Check X-Real-IP header
realIP := r.Header.Get("X-Real-IP")
if realIP != "" {
return realIP
}
// Fallback to RemoteAddr
ip := r.RemoteAddr
if strings.Contains(ip, ":") {
ip = strings.Split(ip, ":")[0]
}
return ip
}
// parseBrowser identifies the browser family and version from a User-Agent
// string. Detection is ordered most-specific-first because UA strings
// overlap: Edge UAs also contain "chrome", and Chrome UAs contain "safari".
// The previous map-based version iterated in Go's randomized map order, so
// the result for such UAs was nondeterministic between requests.
func parseBrowser(userAgent string) (string, string) {
	ua := strings.ToLower(userAgent)
	browsers := []struct {
		key  string
		name string
	}{
		{"edg", "Edge"},
		{"opr", "Opera"}, // modern Opera UAs use "OPR/"
		{"opera", "Opera"},
		{"firefox", "Firefox"},
		{"chrome", "Chrome"},
		{"safari", "Safari"}, // last: nearly every WebKit/Blink UA contains it
	}
	for _, b := range browsers {
		if strings.Contains(ua, b.key) {
			// Extract the version number that follows the token.
			version := extractVersion(ua, b.key)
			return b.name, version
		}
	}
	return "Unknown", ""
}
// extractVersion pulls the numeric version that follows a browser token
// (e.g. "120.0.6099" from "chrome/120.0.6099") out of an already-lowercased
// user-agent string. Returns "" when the token is absent or no digits follow.
func extractVersion(ua, browser string) string {
	idx := strings.Index(ua, browser)
	if idx == -1 {
		return ""
	}
	// Position just past the token, then skip separator characters.
	pos := idx + len(browser)
	for pos < len(ua) && (ua[pos] == '/' || ua[pos] == ' ') {
		pos++
	}
	// Consume digits and dots; anything else terminates the version.
	end := pos
	for end < len(ua) {
		c := ua[end]
		if (c < '0' || c > '9') && c != '.' {
			break
		}
		end++
	}
	return ua[pos:end]
}
func parseOS(userAgent string) string {
ua := strings.ToLower(userAgent)
oses := []struct {
keyword string
name string
}{
{"windows nt 10", "Windows 10"},
{"windows nt 11", "Windows 11"},
{"mac os x", "macOS"},
{"android", "Android"},
{"iphone", "iOS"},
{"ipad", "iOS"},
{"linux", "Linux"},
}
for _, os := range oses {
if strings.Contains(ua, os.keyword) {
return os.name
}
}
return "Unknown"
}
// parseDeviceType classifies a User-Agent as Tablet, Mobile, or Desktop.
// Tablet keywords are checked first: iPad UAs contain the "Mobile" token
// and Android tablets contain "android", so the previous mobile-first
// ordering misclassified every tablet as Mobile.
func parseDeviceType(userAgent string) string {
	ua := strings.ToLower(userAgent)
	if strings.Contains(ua, "tablet") || strings.Contains(ua, "ipad") {
		return "Tablet"
	}
	if strings.Contains(ua, "mobile") || strings.Contains(ua, "android") || strings.Contains(ua, "iphone") {
		return "Mobile"
	}
	return "Desktop"
}
// getOrCreateSession reuses the caller's "session_id" cookie when present;
// otherwise it mints a fresh UUID. Note the new ID is only returned, not
// set as a cookie on the response — the caller stores it with the site row.
func getOrCreateSession(r *http.Request) string {
	if cookie, err := r.Cookie("session_id"); err == nil && cookie.Value != "" {
		return cookie.Value
	}
	return uuid.New().String()
}
func extractCookies(r *http.Request) string {
cookies := r.Cookies()
if len(cookies) == 0 {
return ""
}
cookieData := make(map[string]string)
for _, cookie := range cookies {
cookieData[cookie.Name] = cookie.Value
}
data, _ := json.Marshal(cookieData)
return string(data)
}
// generateXMLSitemap renders pages as a sitemaps.org <urlset> document body
// (without the XML declaration — the caller prepends xml.Header).
// URLs are escaped via xmlEscape; lastmod uses the date-only W3C format.
func generateXMLSitemap(pages []*models.Page) string {
	var sb strings.Builder
	sb.WriteString("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")
	for _, page := range pages {
		sb.WriteString(" <url>\n")
		sb.WriteString(fmt.Sprintf(" <loc>%s</loc>\n", xmlEscape(page.URL)))
		// NOTE(review): a zero LastModified renders as "0001-01-01" — assumes
		// the crawler always populates it; confirm against crawler.go.
		sb.WriteString(fmt.Sprintf(" <lastmod>%s</lastmod>\n", page.LastModified.Format("2006-01-02")))
		sb.WriteString(fmt.Sprintf(" <changefreq>%s</changefreq>\n", page.ChangeFreq))
		sb.WriteString(fmt.Sprintf(" <priority>%.1f</priority>\n", page.Priority))
		sb.WriteString(" </url>\n")
	}
	sb.WriteString("</urlset>")
	return sb.String()
}
// xmlEscape replaces the five XML special characters with their entity
// references in a single pass. Output is identical to sequential
// ReplaceAll calls with "&" handled first (no double-escaping either way).
func xmlEscape(s string) string {
	replacer := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		"\"", "&quot;",
		"'", "&apos;",
	)
	return replacer.Replace(s)
}

726
index.html Normal file
View File

@@ -0,0 +1,726 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Sitemap Generator</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 1200px;
margin: 0 auto;
background: white;
border-radius: 12px;
box-shadow: 0 20px 40px rgba(0,0,0,0.1);
overflow: hidden;
}
.header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
text-align: center;
}
.header h1 {
font-size: 2.5rem;
margin-bottom: 10px;
}
.header p {
opacity: 0.9;
font-size: 1.1rem;
}
.main {
padding: 40px;
}
.form-section {
background: #f8f9fa;
padding: 30px;
border-radius: 8px;
margin-bottom: 30px;
}
.form-group {
margin-bottom: 20px;
}
label {
display: block;
margin-bottom: 8px;
font-weight: 600;
color: #333;
}
input[type="text"], input[type="number"] {
width: 100%;
padding: 12px;
border: 2px solid #e9ecef;
border-radius: 6px;
font-size: 1rem;
transition: border-color 0.3s;
}
input[type="text"]:focus, input[type="number"]:focus {
outline: none;
border-color: #667eea;
}
.btn {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border: none;
padding: 12px 30px;
border-radius: 6px;
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: transform 0.2s, box-shadow 0.2s;
}
.btn:hover {
transform: translateY(-2px);
box-shadow: 0 10px 20px rgba(102, 126, 234, 0.3);
}
.btn:disabled {
background: #6c757d;
cursor: not-allowed;
transform: none;
box-shadow: none;
}
.btn-secondary {
background: #28a745;
margin-left: 10px;
}
.btn-danger {
background: #dc3545;
margin-left: 10px;
}
.progress-section {
display: none;
background: #f8f9fa;
padding: 30px;
border-radius: 8px;
margin-bottom: 30px;
}
.progress-bar {
width: 100%;
height: 20px;
background: #e9ecef;
border-radius: 10px;
overflow: hidden;
margin-bottom: 15px;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #667eea, #764ba2);
width: 0%;
transition: width 0.3s ease;
border-radius: 10px;
}
.status {
padding: 15px;
border-radius: 6px;
margin-bottom: 20px;
font-weight: 500;
}
.status.info {
background: #d1ecf1;
color: rgb(12, 84, 96);
border: 1px solid #bee5eb;
}
.status.success {
background: #d4edda;
color: rgb(21, 87, 36);
border: 1px solid #c3e6cb;
}
.status.error {
background: #f8d7da;
color: rgb(114, 28, 36);
border: 1px solid #f5c6cb;
}
.log-section {
background: #2d3748;
color: #e2e8f0;
padding: 20px;
border-radius: 8px;
font-family: 'Courier New', monospace;
font-size: 0.9rem;
max-height: 400px;
overflow-y: auto;
margin-bottom: 30px;
}
.log-entry {
margin-bottom: 10px;
padding: 8px;
border-radius: 4px;
}
.log-entry.start {
background: rgba(102, 126, 234, 0.2);
border-left: 3px solid #667eea;
}
.log-entry.progress {
background: rgba(40, 167, 69, 0.1);
border-left: 3px solid #28a745;
}
.log-entry.complete {
background: rgba(25, 135, 84, 0.1);
border-left: 3px solid #198754;
}
.log-entry.error {
background: rgba(220, 53, 69, 0.1);
border-left: 3px solid #dc3545;
}
.results-section {
display: none;
background: #f8f9fa;
padding: 30px;
border-radius: 8px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.stat-card {
background: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
text-align: center;
}
.stat-number {
font-size: 2rem;
font-weight: bold;
color: #667eea;
}
.stat-label {
color: #6c757d;
margin-top: 5px;
}
.sites-list {
background: white;
border-radius: 8px;
overflow: hidden;
}
.site-item {
padding: 20px;
border-bottom: 1px solid #e9ecef;
display: flex;
justify-content: space-between;
align-items: center;
}
.site-item:last-child {
border-bottom: none;
}
.site-domain {
font-weight: 600;
color: #333;
margin-bottom: 5px;
}
.site-meta {
color: #6c757d;
font-size: 0.9rem;
}
.connection-status {
display: inline-block;
padding: 5px 10px;
border-radius: 4px;
font-size: 0.9rem;
margin-bottom: 15px;
}
.connection-status.connected {
background: #d4edda;
color: #155724;
}
.connection-status.disconnected {
background: #f8d7da;
color: #721c24;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>🗺️ XML Sitemap Generator</h1>
<p>Generate sitemaps for your websites with real-time progress tracking</p>
</div>
<div class="main">
<!-- Input Form -->
<div class="form-section">
<div class="form-group">
<label for="siteUrl">Website URL</label>
<input type="text" id="siteUrl" placeholder="https://example.com" value="https://example.com">
</div>
<div class="form-group">
<label for="maxDepth">Max Crawl Depth (1-5)</label>
<input type="number" id="maxDepth" min="1" max="5" value="3">
</div>
<button class="btn" id="generateBtn" onclick="sitemapGen.generateSitemap()">
🚀 Generate Sitemap
</button>
</div>
<!-- Progress Section -->
<div class="progress-section" id="progressSection">
<div id="connectionStatus" class="connection-status disconnected">🔴 Disconnected</div>
<div class="progress-bar">
<div class="progress-fill" id="progressFill"></div>
</div>
<div class="status info" id="statusMessage">
Initializing...
</div>
<div class="stats-grid">
<div class="stat-card">
<div class="stat-number" id="totalPages">0</div>
<div class="stat-label">Pages Found</div>
</div>
<div class="stat-card">
<div class="stat-number" id="currentDepth">0</div>
<div class="stat-label">Current Depth</div>
</div>
<div class="stat-card">
<div class="stat-number" id="crawlTime">0s</div>
<div class="stat-label">Crawl Time</div>
</div>
</div>
<div id="currentUrl" style="margin-bottom: 20px; font-size: 0.9rem; color: #6c757d;">
Current: -
</div>
<button class="btn btn-secondary" id="downloadBtn" onclick="sitemapGen.downloadSitemap()" style="display: none;">
📥 Download Sitemap
</button>
<button class="btn btn-danger" onclick="sitemapGen.clearAll()">
🗑️ Clear All
</button>
</div>
<!-- Log Section -->
<div class="log-section" id="logSection" style="display: none;">
<div id="logContainer"></div>
</div>
<!-- Results Section -->
<div class="results-section" id="resultsSection">
<h3 style="margin-bottom: 20px;">Previously Generated Sitemaps</h3>
<div class="sites-list" id="sitesList"></div>
</div>
</div>
</div>
<script>
/**
 * Front-end controller for the sitemap generator UI.
 * Submits crawl requests to the API, subscribes to the server's SSE
 * progress stream, and renders progress, logs, and previously generated
 * sitemaps. One instance drives the whole page (see `sitemapGen` below).
 */
class SitemapGenerator {
    constructor() {
        // State for the crawl currently being tracked (if any).
        this.currentUUID = null;
        this.currentSiteId = null;
        this.eventSource = null;   // active EventSource, or null
        this.startTime = null;     // ms timestamp when the crawl started
        this.crawlTimer = null;    // interval handle for the elapsed-time display
        this.totalPages = 0;
        // Load existing sites on init
        this.loadExistingSites();
    }
    // Kick off a crawl: POST url/depth to the API, then attach to the SSE stream.
    async generateSitemap() {
        let url = document.getElementById('siteUrl').value.trim();
        const maxDepth = parseInt(document.getElementById('maxDepth').value);
        if (!url) {
            alert('Please enter a website URL');
            return;
        }
        // Auto-add protocol if missing
        if (!url.startsWith('http://') && !url.startsWith('https://')) {
            url = 'https://' + url;
            document.getElementById('siteUrl').value = url;
        }
        try {
            this.disableForm();
            this.showProgress();
            this.addLog(`Starting crawl of ${url}`, 'start');
            const response = await fetch('/generate-sitemap-xml', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                credentials: 'include',
                body: JSON.stringify({
                    url: url,
                    max_depth: maxDepth
                })
            });
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }
            const result = await response.json();
            this.currentUUID = result.uuid;
            this.currentSiteId = result.site_id;
            this.addLog(`UUID assigned: ${result.uuid}`, 'start');
            this.addLog(`${result.message}`, 'start');
            // Connect to stream with UUID
            this.connectToStream(result.uuid);
        } catch (error) {
            this.addLog(`Failed to start crawl: ${error.message}`, 'error');
            this.showError(error.message);
            this.stopCrawlTimer();
            this.enableForm();
        }
    }
    // Open the SSE connection for a crawl UUID and wire handlers for each
    // named event the server emits: connected, started, progress, complete, error.
    connectToStream(uuid) {
        // Only one stream at a time; drop any previous connection.
        if (this.eventSource) {
            this.eventSource.close();
        }
        this.addLog(`Connecting to stream: /stream/${uuid}`, 'start');
        try {
            this.eventSource = new EventSource(`/stream/${uuid}`);
            this.eventSource.addEventListener('connected', (e) => {
                const data = JSON.parse(e.data);
                this.addLog(`Connected to stream`, 'progress');
                this.updateConnectionStatus(true);
                document.getElementById('statusMessage').textContent = '🔄 Crawling...';
            });
            this.eventSource.addEventListener('started', (e) => {
                const data = JSON.parse(e.data);
                this.startTime = Date.now();
                this.startCrawlTimer();
                this.addLog(`Crawl started: ${data.url}`, 'start');
            });
            this.eventSource.addEventListener('progress', (e) => {
                const data = JSON.parse(e.data);
                this.updateProgress(data);
                if (data.current_url) {
                    this.addLog(`Depth ${data.depth}: ${data.current_url}`, 'progress');
                }
            });
            this.eventSource.addEventListener('complete', (e) => {
                const data = JSON.parse(e.data);
                this.completeGeneration(data);
                this.addLog(`✅ Complete! Found ${data.pages_found} pages`, 'complete');
                this.showDownloadButton();
                this.eventSource.close();
                this.stopCrawlTimer();
                this.loadExistingSites();
            });
            // Server-sent "error" event (carries a payload), as opposed to
            // the transport-level onerror below.
            this.eventSource.addEventListener('error', (e) => {
                if (e.data) {
                    try {
                        const data = JSON.parse(e.data);
                        this.showError(data.error);
                        this.addLog(`❌ Error: ${data.error}`, 'error');
                    } catch (err) {
                        this.addLog('Stream error occurred', 'error');
                    }
                }
                this.eventSource.close();
                this.updateConnectionStatus(false);
                this.enableForm();
                this.stopCrawlTimer();
            });
            // Transport-level failure (connection dropped/refused).
            this.eventSource.onerror = (e) => {
                this.updateConnectionStatus(false);
                this.addLog('Stream connection lost', 'error');
            };
        } catch (error) {
            this.addLog(`Failed to connect to stream: ${error.message}`, 'error');
        }
    }
    // Refresh the elapsed-time display once per second while crawling.
    startCrawlTimer() {
        this.stopCrawlTimer();
        this.crawlTimer = setInterval(() => {
            if (this.startTime) {
                const elapsed = Math.floor((Date.now() - this.startTime) / 1000);
                const element = document.getElementById('crawlTime');
                if (element) {
                    element.textContent = `${elapsed}s`;
                }
            }
        }, 1000);
    }
    stopCrawlTimer() {
        if (this.crawlTimer) {
            clearInterval(this.crawlTimer);
            this.crawlTimer = null;
        }
    }
    // Apply one SSE progress payload to the stat cards and progress bar.
    updateProgress(data) {
        if (data.pages_found !== undefined) {
            this.totalPages = data.pages_found;
            document.getElementById('totalPages').textContent = data.pages_found;
        }
        if (data.depth !== undefined) {
            document.getElementById('currentDepth').textContent = data.depth;
        }
        if (data.current_url) {
            document.getElementById('currentUrl').textContent = `Current: ${data.current_url}`;
        }
        // Update progress bar (estimated) — capped at 90% until the
        // "complete" event arrives, since the total page count is unknown.
        const progress = Math.min((this.totalPages / 100) * 100, 90);
        document.getElementById('progressFill').style.width = `${progress}%`;
    }
    // Finalize the UI after the "complete" SSE event.
    completeGeneration(data) {
        this.enableForm();
        document.getElementById('progressFill').style.width = '100%';
        document.getElementById('statusMessage').className = 'status success';
        document.getElementById('statusMessage').textContent = `✅ Complete! Found ${data.pages_found || this.totalPages} pages`;
        if (this.startTime) {
            const elapsed = Math.floor((Date.now() - this.startTime) / 1000);
            document.getElementById('crawlTime').textContent = `${elapsed}s`;
        }
    }
    showDownloadButton() {
        document.getElementById('downloadBtn').style.display = 'inline-block';
    }
    // Download the current crawl's sitemap; falls back to the most recent
    // site when no crawl ran in this session.
    async downloadSitemap() {
        if (!this.currentUUID) {
            // Try to get the latest site
            await this.loadExistingSites();
        }
        if (this.currentUUID) {
            window.open(`/download/${this.currentUUID}`, '_blank');
        } else {
            alert('No sitemap available for download');
        }
    }
    // Fetch all stored sites and render them; also adopts the newest site
    // as the "current" one for the download button.
    async loadExistingSites() {
        try {
            const response = await fetch('/sites');
            if (!response.ok) return;
            const sites = await response.json();
            this.displaySites(sites);
            if (sites.length > 0) {
                this.currentUUID = sites[0].uuid;
                this.currentSiteId = sites[0].id;
            }
        } catch (error) {
            console.error('Failed to load sites:', error);
        }
    }
    // Render the "previously generated" list from the /sites payload.
    displaySites(sites) {
        const container = document.getElementById('sitesList');
        container.innerHTML = '';
        if (sites.length === 0) {
            container.innerHTML = '<p>No sitemaps generated yet.</p>';
            document.getElementById('resultsSection').style.display = 'none';
            return;
        }
        sites.forEach(site => {
            const siteDiv = document.createElement('div');
            siteDiv.className = 'site-item';
            const createdDate = new Date(site.created_at).toLocaleString();
            siteDiv.innerHTML = `
                <div class="site-info">
                    <div class="site-domain">${site.domain}</div>
                    <div class="site-meta">
                        ${site.page_count} pages •
                        Status: ${site.status}
                        Created: ${createdDate}
                    </div>
                </div>
                <div>
                    <button class="btn btn-secondary" onclick="sitemapGen.downloadSiteSitemap('${site.uuid}')">
                        📥 Download
                    </button>
                    <button class="btn btn-danger" onclick="sitemapGen.deleteSite(${site.id})">
                        🗑️ Delete
                    </button>
                </div>
            `;
            container.appendChild(siteDiv);
        });
        document.getElementById('resultsSection').style.display = 'block';
    }
    async downloadSiteSitemap(uuid) {
        window.open(`/download/${uuid}`, '_blank');
    }
    // Delete one site (after confirmation) and refresh the list.
    async deleteSite(siteId) {
        if (!confirm('Are you sure you want to delete this sitemap?')) return;
        try {
            const response = await fetch(`/sites/${siteId}`, {
                method: 'DELETE'
            });
            if (response.ok) {
                this.addLog(`Site ${siteId} deleted`, 'complete');
                this.loadExistingSites();
            } else {
                throw new Error('Failed to delete site');
            }
        } catch (error) {
            this.addLog(`Delete failed: ${error.message}`, 'error');
        }
    }
    // Delete every stored site, one DELETE request each, then reset the UI.
    async clearAll() {
        if (!confirm('Are you sure you want to clear all data? This cannot be undone.')) return;
        try {
            this.stopCrawlTimer();
            const sitesResponse = await fetch('/sites');
            if (sitesResponse.ok) {
                const sites = await sitesResponse.json();
                for (const site of sites) {
                    await fetch(`/sites/${site.id}`, {
                        method: 'DELETE'
                    });
                }
            }
            this.addLog('All data cleared', 'complete');
            this.loadExistingSites();
            this.hideProgress();
        } catch (error) {
            this.addLog(`Clear failed: ${error.message}`, 'error');
        }
    }
    showProgress() {
        document.getElementById('progressSection').style.display = 'block';
        document.getElementById('logSection').style.display = 'block';
    }
    hideProgress() {
        document.getElementById('progressSection').style.display = 'none';
    }
    showError(message) {
        document.getElementById('statusMessage').className = 'status error';
        document.getElementById('statusMessage').textContent = `❌ Error: ${message}`;
    }
    // Prepend a timestamped entry to the on-page log (newest first).
    // NOTE(review): message is injected via innerHTML, so crawled URLs end
    // up unescaped in the DOM — confirm whether that is acceptable here.
    addLog(message, type = 'info') {
        const container = document.getElementById('logContainer');
        const entry = document.createElement('div');
        entry.className = `log-entry ${type}`;
        const timestamp = new Date().toLocaleTimeString();
        entry.innerHTML = `<strong>[${timestamp}]</strong> ${message}`;
        container.prepend(entry);
        container.scrollTop = 0;
    }
    // Toggle the green/red connection badge.
    updateConnectionStatus(connected) {
        const status = document.getElementById('connectionStatus');
        if (connected) {
            status.className = 'connection-status connected';
            status.textContent = '🟢 Connected';
        } else {
            status.className = 'connection-status disconnected';
            status.textContent = '🔴 Disconnected';
        }
    }
    // Lock the form while a crawl is in flight.
    disableForm() {
        document.getElementById('generateBtn').disabled = true;
        document.getElementById('generateBtn').textContent = '🔄 Generating...';
        document.getElementById('siteUrl').disabled = true;
        document.getElementById('maxDepth').disabled = true;
    }
    enableForm() {
        document.getElementById('generateBtn').disabled = false;
        document.getElementById('generateBtn').textContent = '🚀 Generate Sitemap';
        document.getElementById('siteUrl').disabled = false;
        document.getElementById('maxDepth').disabled = false;
    }
}
// Initialize the application: a single page-wide instance, referenced by
// name from the inline onclick handlers in the markup above.
const sitemapGen = new SitemapGenerator();
</script>
</body>
</html>

72
main.go Normal file
View File

@@ -0,0 +1,72 @@
package main
import (
	"log"
	"net/http"
	"os"
	"time"

	"sitemap-api/database"
	"sitemap-api/handlers"

	"github.com/go-chi/chi/v5"
	"github.com/go-chi/chi/v5/middleware"
	"github.com/go-chi/cors"
)
// main wires together the SQLite store, SSE stream manager, HTTP handlers,
// and chi router, then serves the API (plus the static UI) on $PORT
// (default 8080).
func main() {
	// Initialize database (SQLite file in the working directory).
	db, err := database.NewDB("sitemap.db")
	if err != nil {
		log.Fatal("Failed to initialize database:", err)
	}
	defer db.Close()
	// Initialize stream manager shared by the crawl starter and SSE endpoint.
	streamManager := handlers.NewStreamManager()
	// Initialize handler
	h := handlers.NewHandler(db, streamManager)
	// Setup router
	r := chi.NewRouter()
	// Middleware
	r.Use(middleware.Logger)
	r.Use(middleware.Recoverer)
	r.Use(middleware.RealIP)
	r.Use(cors.Handler(cors.Options{
		AllowedOrigins:   []string{"https://*", "http://*"},
		AllowedMethods:   []string{"GET", "POST", "PUT", "DELETE", "OPTIONS"},
		AllowedHeaders:   []string{"Accept", "Authorization", "Content-Type"},
		ExposedHeaders:   []string{"Link"},
		AllowCredentials: true,
		MaxAge:           300,
	}))
	// Serve static HTML at root
	r.Get("/", func(w http.ResponseWriter, r *http.Request) {
		http.ServeFile(w, r, "static/index.html")
	})
	// API Routes
	r.Post("/generate-sitemap-xml", h.GenerateSitemapXML)
	r.Get("/stream/{uuid}", h.StreamSSE)
	r.Get("/download/{uuid}", h.DownloadSitemap)
	r.Get("/sites", h.GetSites)
	r.Get("/sites/{id}", h.GetSite)
	r.Delete("/sites/{id}", h.DeleteSite)
	r.Get("/health", h.Health)
	// Get port from environment or use default
	port := os.Getenv("PORT")
	if port == "" {
		port = "8080"
	}
	log.Printf("Server starting on port %s...", port)
	log.Printf("Visit http://localhost:%s to use the sitemap generator", port)
	// Use an explicit http.Server so header reads are bounded; a bare
	// http.ListenAndServe has no timeouts at all. WriteTimeout is left
	// unset on purpose: the SSE endpoint holds responses open indefinitely.
	srv := &http.Server{
		Addr:              ":" + port,
		Handler:           r,
		ReadHeaderTimeout: 10 * time.Second,
	}
	if err := srv.ListenAndServe(); err != nil {
		log.Fatal("Server failed to start:", err)
	}
}

44
run.sh Normal file
View File

@@ -0,0 +1,44 @@
#!/bin/bash
# Convenience launcher: verifies the Go toolchain is present, fetches module
# dependencies, builds the binary, and runs it in the foreground on :8080.
echo "🗺️ XML Sitemap Generator API"
echo "=============================="
echo ""
# Check if Go is installed
if ! command -v go &> /dev/null; then
    echo "❌ Error: Go is not installed"
    echo "Please install Go 1.21+ from https://golang.org/dl/"
    exit 1
fi
echo "✅ Go version: $(go version)"
echo ""
# Install dependencies (go.mod/go.sum must be in the current directory)
echo "📦 Installing dependencies..."
go mod download
if [ $? -ne 0 ]; then
    echo "❌ Failed to download dependencies"
    exit 1
fi
echo "✅ Dependencies installed"
echo ""
# Build the application into ./sitemap-api
echo "🔨 Building application..."
go build -o sitemap-api .
if [ $? -ne 0 ]; then
    echo "❌ Build failed"
    exit 1
fi
echo "✅ Build successful"
echo ""
# Run the application (blocks until Ctrl+C)
echo "🚀 Starting server..."
echo ""
echo "Server will start on http://localhost:8080"
echo "Press Ctrl+C to stop"
echo ""
./sitemap-api

59
site.go Normal file
View File

@@ -0,0 +1,59 @@
package models
import "time"
// Site is one sitemap-generation job: the target URL and crawl settings,
// the client metadata captured when the job was submitted, and lifecycle
// timestamps. Rows are serialized as JSON by the /sites endpoints.
type Site struct {
	ID             int        `json:"id"`              // database primary key
	UUID           string     `json:"uuid"`            // server-generated public identifier
	Domain         string     `json:"domain"`          // host portion of URL
	URL            string     `json:"url"`             // crawl entry point as submitted
	MaxDepth       int        `json:"max_depth"`       // crawl depth limit (clamped to 1..5 by the handler)
	PageCount      int        `json:"page_count"`      // pages found; updated when the crawl finishes
	Status         string     `json:"status"`          // processing, completed, failed
	IPAddress      string     `json:"ip_address"`      // submitting client's IP
	UserAgent      string     `json:"user_agent"`      // raw User-Agent header
	Browser        string     `json:"browser"`         // parsed from UserAgent
	BrowserVersion string     `json:"browser_version"` // parsed from UserAgent
	OS             string     `json:"os"`              // parsed from UserAgent
	DeviceType     string     `json:"device_type"`     // Mobile / Tablet / Desktop
	SessionID      string     `json:"session_id"`      // session cookie value, or fresh UUID
	Cookies        string     `json:"cookies"`         // request cookies as a JSON object, "" if none
	Referrer       string     `json:"referrer"`        // Referer header
	CreatedAt      time.Time  `json:"created_at"`
	CompletedAt    *time.Time `json:"completed_at,omitempty"` // nil until the crawl finishes
	LastCrawled    *time.Time `json:"last_crawled,omitempty"` // nil until the crawl finishes
}
// Page is one URL discovered during a crawl, belonging to a Site and
// rendered as a <url> entry in the downloaded sitemap.
type Page struct {
	ID           int       `json:"id"`            // database primary key
	SiteID       int       `json:"site_id"`       // owning Site.ID (rows cascade-delete with the site)
	URL          string    `json:"url"`           // absolute page URL
	Depth        int       `json:"depth"`         // link distance from the crawl entry point
	LastModified time.Time `json:"last_modified"` // rendered as <lastmod> (date only)
	Priority     float64   `json:"priority"`      // rendered as <priority> with one decimal
	ChangeFreq   string    `json:"change_freq"`   // rendered as <changefreq>
}
// Event is the envelope passed through a crawl's stream channel and written
// to the SSE wire: Type becomes the "event:" field and Data, JSON-encoded,
// becomes the "data:" payload.
type Event struct {
	Type string      `json:"type"`
	Data interface{} `json:"data"`
}
// ProgressData is the payload of a "progress" SSE event, emitted as the
// crawler visits pages.
type ProgressData struct {
	UUID       string `json:"uuid"`        // crawl identifier
	PagesFound int    `json:"pages_found"` // running total of discovered pages
	Depth      int    `json:"depth"`       // depth of the page being processed
	CurrentURL string `json:"current_url"` // URL being processed
}
// CompleteData is the payload of the final "complete" SSE event.
type CompleteData struct {
	UUID        string `json:"uuid"`         // crawl identifier
	PagesFound  int    `json:"pages_found"`  // final page count
	SiteID      int    `json:"site_id"`      // database ID of the finished site
	DownloadURL string `json:"download_url"` // where the client can fetch the XML
}
// ErrorData is the payload of an "error" SSE event when a crawl fails.
type ErrorData struct {
	UUID  string `json:"uuid"`  // crawl identifier
	Error string `json:"error"` // human-readable failure description
}