init
This commit is contained in:
38
Dockerfile
Normal file
38
Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
||||
# Build stage
FROM golang:1.21-alpine AS builder

# Install build dependencies (gcc/musl-dev/sqlite-dev are required because
# the sqlite driver needs CGO)
RUN apk add --no-cache git gcc musl-dev sqlite-dev

WORKDIR /app

# Copy go mod files first so the dependency download layer is cached
# between builds when only source files change
COPY go.mod go.sum ./
RUN go mod download

# Copy source code
COPY . .

# Build the application. CGO stays enabled for sqlite. The obsolete
# "-a -installsuffix cgo" flags were dropped: they were a pre-module-era
# workaround for CGO_ENABLED=0 builds and are a no-op (or pure slowdown,
# in the case of -a) on modern Go.
RUN CGO_ENABLED=1 GOOS=linux go build -o sitemap-api .

# Final stage -- pinned instead of :latest for reproducible image builds
FROM alpine:3.19

# Install runtime dependencies (TLS roots for outbound crawling, sqlite runtime)
RUN apk --no-cache add ca-certificates sqlite-libs

WORKDIR /root/

# Copy binary and static assets from builder
COPY --from=builder /app/sitemap-api .
COPY --from=builder /app/static ./static

# Expose port
EXPOSE 8080

# Set environment
ENV PORT=8080

# Run the application
CMD ["./sitemap-api"]
|
||||
61
Makefile
Normal file
61
Makefile
Normal file
@@ -0,0 +1,61 @@
|
||||
# All targets are command shortcuts, not real files; list every one in .PHONY
# (format/lint/docker-build/docker-run were missing, so a file with one of
# those names would silently shadow its target).
.PHONY: help build run clean test install dev format lint docker-build docker-run

help: ## Show this help message
	@echo "XML Sitemap Generator API - Make Commands"
	@echo ""
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'

install: ## Install Go dependencies
	@echo "📦 Installing dependencies..."
	@go mod download
	@echo "✅ Dependencies installed"

build: ## Build the application
	@echo "🔨 Building..."
	@go build -o sitemap-api .
	@echo "✅ Build complete: ./sitemap-api"

run: build ## Build and run the application
	@echo "🚀 Starting server on http://localhost:8080"
	@./sitemap-api

dev: ## Run in development mode (with hot reload if air is installed)
	@if command -v air > /dev/null; then \
		air; \
	else \
		echo "💡 Tip: Install 'air' for hot reload: go install github.com/cosmtrek/air@latest"; \
		$(MAKE) run; \
	fi

clean: ## Clean build artifacts and database
	@echo "🧹 Cleaning..."
	@rm -f sitemap-api
	@rm -f *.db
	@rm -f *.db-journal
	@echo "✅ Clean complete"

test: ## Run tests
	@echo "🧪 Running tests..."
	@go test -v ./...

format: ## Format code
	@echo "📝 Formatting code..."
	@go fmt ./...
	@echo "✅ Code formatted"

lint: ## Run linter (requires golangci-lint)
	@echo "🔍 Running linter..."
	@if command -v golangci-lint > /dev/null; then \
		golangci-lint run; \
	else \
		echo "❌ golangci-lint not installed. Install: https://golangci-lint.run/usage/install/"; \
	fi

docker-build: ## Build Docker image
	@echo "🐳 Building Docker image..."
	@docker build -t sitemap-api .
	@echo "✅ Docker image built: sitemap-api"

docker-run: docker-build ## Run in Docker container
	@echo "🐳 Running in Docker..."
	@docker run -p 8080:8080 sitemap-api
|
||||
447
PROJECT_OVERVIEW.md
Normal file
447
PROJECT_OVERVIEW.md
Normal file
@@ -0,0 +1,447 @@
|
||||
# 🗺️ XML Sitemap Generator - Complete Implementation
|
||||
|
||||
## Project Overview
|
||||
|
||||
A production-ready Go API for generating XML sitemaps with real-time progress tracking. Built with concurrent crawling, SSE streaming, and comprehensive client metadata tracking.
|
||||
|
||||
## ✨ Key Features Implemented
|
||||
|
||||
### 1. **Backend-Generated UUID System**
|
||||
- Server generates unique UUID for each crawl request
|
||||
- UUID used for SSE stream connection and file download
|
||||
- Enables true multi-user support with isolated streams
|
||||
|
||||
### 2. **Server-Sent Events (SSE) Streaming**
|
||||
- Real-time progress updates via `/stream/{uuid}`
|
||||
- Event types: `connected`, `started`, `progress`, `complete`, `error`
|
||||
- Non-blocking concurrent stream management
|
||||
- Automatic cleanup after completion
|
||||
|
||||
### 3. **Concurrent Web Crawler**
|
||||
- Goroutine-based parallel crawling
|
||||
- Configurable concurrency limit (default: 5 parallel requests)
|
||||
- Depth-limited crawling (1-5 levels)
|
||||
- Same-domain restriction with URL normalization
|
||||
- Duplicate detection and prevention
|
||||
|
||||
### 4. **Client Metadata Tracking**
|
||||
Automatically captured and stored in SQLite:
|
||||
- IP Address (with X-Forwarded-For support)
|
||||
- User-Agent string
|
||||
- Browser name & version (Chrome, Firefox, Safari, Edge, Opera)
|
||||
- Operating System (Windows, macOS, Linux, Android, iOS)
|
||||
- Device Type (Desktop, Mobile, Tablet)
|
||||
- Session ID (cookie-based persistence)
|
||||
- All cookies (JSON-encoded)
|
||||
- HTTP Referrer
|
||||
|
||||
### 5. **RESTful API Endpoints**
|
||||
```
|
||||
POST /generate-sitemap-xml → Start crawl, returns UUID
|
||||
GET /stream/{uuid} → SSE progress stream
|
||||
GET /download/{uuid} → Download XML sitemap
|
||||
GET /sites → List all sitemaps
|
||||
GET /sites/{id} → Get specific site
|
||||
DELETE /sites/{id} → Delete sitemap
|
||||
GET /health → Health check
|
||||
GET / → Serve frontend HTML
|
||||
```
|
||||
|
||||
### 6. **Beautiful Frontend UI**
|
||||
- Responsive gradient design
|
||||
- Real-time progress visualization
|
||||
- Live connection status indicator
|
||||
- Crawl statistics (pages found, depth, time)
|
||||
- Activity log with color-coded entries
|
||||
- Site management (view, download, delete)
|
||||
- Auto-protocol addition for URLs
|
||||
|
||||
## 🏗️ Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐
|
||||
│ Browser │
|
||||
│ (Frontend) │
|
||||
└──────┬──────┘
|
||||
│ POST /generate-sitemap-xml
|
||||
↓
|
||||
┌──────────────────────────────────┐
|
||||
│ Go HTTP Server (Chi Router) │
|
||||
│ │
|
||||
│ ┌────────────────────────────┐ │
|
||||
│ │ Handler (handler.go) │ │
|
||||
│ │ - Generate UUID │ │
|
||||
│ │ - Extract metadata │ │
|
||||
│ │ - Create DB record │ │
|
||||
│ │ - Spawn crawler │ │
|
||||
│ │ - Return UUID immediately│ │
|
||||
│ └─────────────┬──────────────┘ │
|
||||
└────────────────┼────────────────┘
|
||||
│
|
||||
┌─────────┴─────────┐
|
||||
│ │
|
||||
↓ ↓
|
||||
┌──────────────┐ ┌───────────────┐
|
||||
│ StreamManager│ │ Crawler │
|
||||
│ │ │ │
|
||||
│ UUID → Chan │ │ Goroutines │
|
||||
│ Map storage │←──│ Concurrent │
|
||||
│ │ │ HTTP requests│
|
||||
└──────┬───────┘ └───────┬───────┘
|
||||
│ │
|
||||
│ SSE Events │ Save pages
|
||||
↓ ↓
|
||||
┌──────────────────────────────────┐
|
||||
│ SQLite Database │
|
||||
│ - sites (with metadata) │
|
||||
│ - pages (discovered URLs) │
|
||||
│ - sessions (tracking) │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
|
||||
## 📂 File Structure
|
||||
|
||||
```
|
||||
sitemap-api/
|
||||
├── main.go # HTTP server setup, routes
|
||||
├── go.mod # Go module dependencies
|
||||
├── go.sum # Dependency checksums
|
||||
│
|
||||
├── handlers/
|
||||
│ └── handler.go # All HTTP handlers
|
||||
│ - GenerateSitemapXML # POST endpoint
|
||||
│ - StreamSSE # SSE streaming
|
||||
│ - DownloadSitemap # XML generation
|
||||
│ - GetSites/GetSite # CRUD operations
|
||||
│ - DeleteSite # Cleanup
|
||||
│ - StreamManager # Concurrent stream management
|
||||
│
|
||||
├── crawler/
|
||||
│ └── crawler.go # Web crawler implementation
|
||||
│ - Crawl() # Main crawl logic
|
||||
│ - crawlURL() # Recursive URL processing
|
||||
│ - extractLinks() # HTML parsing
|
||||
│ - normalizeURL() # URL canonicalization
|
||||
│ - isSameDomain() # Domain checking
|
||||
│ - calculatePriority() # Sitemap priority
|
||||
│
|
||||
├── database/
|
||||
│ └── db.go # SQLite operations
|
||||
│ - NewDB() # Initialize DB
|
||||
│ - createTables() # Schema creation
|
||||
│ - CreateSite() # Insert site record
|
||||
│ - GetSiteByUUID() # Retrieve by UUID
|
||||
│ - UpdateSiteStatus() # Mark complete
|
||||
│ - AddPage() # Save discovered page
|
||||
│ - GetPagesBySiteID() # Retrieve all pages
|
||||
│ - DeleteSite() # Cascade delete
|
||||
│
|
||||
├── models/
|
||||
│ └── site.go # Data structures
|
||||
│ - Site # Site record
|
||||
│ - Page # Page record
|
||||
│ - Event # SSE event
|
||||
│ - ProgressData # Progress payload
|
||||
│ - CompleteData # Completion payload
|
||||
│ - ErrorData # Error payload
|
||||
│
|
||||
├── static/
|
||||
│ └── index.html # Frontend application
|
||||
│ - SitemapGenerator # Main class
|
||||
│ - generateSitemap() # Initiate crawl
|
||||
│ - connectToStream() # SSE connection
|
||||
│ - updateProgress() # Live updates
|
||||
│ - downloadSitemap() # File download
|
||||
│ - displaySites() # Results listing
|
||||
│
|
||||
├── README.md # Full documentation
|
||||
├── QUICKSTART.md # Quick start guide
|
||||
├── Makefile # Build automation
|
||||
├── Dockerfile # Container setup
|
||||
├── run.sh # Startup script
|
||||
├── .gitignore # Git exclusions
|
||||
└── .env.example # Environment template
|
||||
```
|
||||
|
||||
## 🔄 Request Flow
|
||||
|
||||
### 1. Generate Sitemap Request
|
||||
```
|
||||
User fills form → POST /generate-sitemap-xml
|
||||
↓
|
||||
Server generates UUID
|
||||
↓
|
||||
Extract IP, UA, cookies, session
|
||||
↓
|
||||
Save to database (status: processing)
|
||||
↓
|
||||
Create SSE channel in StreamManager
|
||||
↓
|
||||
Spawn goroutine for crawler (non-blocking)
|
||||
↓
|
||||
Return UUID immediately to frontend
|
||||
```
|
||||
|
||||
### 2. SSE Stream Connection
|
||||
```
|
||||
Frontend receives UUID → GET /stream/{uuid}
|
||||
↓
|
||||
StreamManager finds channel
|
||||
↓
|
||||
Send "connected" event
|
||||
↓
|
||||
Crawler sends events to channel
|
||||
↓
|
||||
Handler forwards to browser
|
||||
↓
|
||||
Frontend updates UI in real-time
|
||||
```
|
||||
|
||||
### 3. Crawler Operation
|
||||
```
|
||||
Start from root URL → Fetch HTML
|
||||
↓
|
||||
Parse <a> tags for links
|
||||
↓
|
||||
Check: same domain? not visited?
|
||||
↓
|
||||
Save page to database (URL, depth, priority)
|
||||
↓
|
||||
Send "progress" event via channel
|
||||
↓
|
||||
Spawn goroutines for child URLs
|
||||
↓
|
||||
Repeat until max depth reached
|
||||
↓
|
||||
Send "complete" event
|
||||
↓
|
||||
Close channel, cleanup resources
|
||||
```
|
||||
|
||||
### 4. Download Request
|
||||
```
|
||||
User clicks download → GET /download/{uuid}
|
||||
↓
|
||||
Lookup site by UUID
|
||||
↓
|
||||
Fetch all pages from database
|
||||
↓
|
||||
Generate XML sitemap
|
||||
↓
|
||||
Set Content-Disposition header
|
||||
↓
|
||||
Stream XML to browser
|
||||
```
|
||||
|
||||
## 🔐 Security Considerations
|
||||
|
||||
### Implemented
|
||||
- ✅ Same-domain restriction (no external crawling)
|
||||
- ✅ Max depth limit (prevents infinite loops)
|
||||
- ✅ HTTP timeout per request (10 seconds)
|
||||
- ✅ Duplicate URL prevention
|
||||
- ✅ SQLite prepared statements (SQL injection safe)
|
||||
- ✅ CORS middleware included
|
||||
|
||||
### Recommended for Production
|
||||
- [ ] Rate limiting per IP
|
||||
- [ ] Authentication/API keys
|
||||
- [ ] Input validation & sanitization
|
||||
- [ ] Request size limits
|
||||
- [ ] robots.txt respect
|
||||
- [ ] User-Agent identification
|
||||
- [ ] HTTPS enforcement
|
||||
- [ ] Firewall rules
|
||||
|
||||
## 🚀 Performance Optimization
|
||||
|
||||
### Current
|
||||
- Concurrent goroutines (5 parallel requests default)
|
||||
- Non-blocking SSE streams
|
||||
- Efficient channel-based communication
|
||||
- In-memory visited URL tracking
|
||||
- Database connection pooling
|
||||
|
||||
### Possible Improvements
|
||||
- Redis for distributed crawling
|
||||
- Worker pool pattern
|
||||
- Content caching
|
||||
- Incremental sitemap updates
|
||||
- Compression for large sitemaps
|
||||
- Database indexing optimization
|
||||
|
||||
## 📊 Database Schema
|
||||
|
||||
### sites table
|
||||
```sql
|
||||
- id (PK) - Auto-increment
|
||||
- uuid (UNIQUE) - Server-generated UUID
|
||||
- domain - Extracted from URL
|
||||
- url - Full starting URL
|
||||
- max_depth - Crawl depth limit
|
||||
- page_count - Total pages found
|
||||
- status - processing/completed/failed
|
||||
- ip_address - Client IP
|
||||
- user_agent - Full UA string
|
||||
- browser - Parsed browser name
|
||||
- browser_version - Version number
|
||||
- os - Operating system
|
||||
- device_type - Desktop/Mobile/Tablet
|
||||
- session_id - Cookie-based session
|
||||
- cookies - JSON of all cookies
|
||||
- referrer - HTTP Referer header
|
||||
- created_at - Timestamp
|
||||
- completed_at - Completion timestamp
|
||||
- last_crawled - Last activity
|
||||
```
|
||||
|
||||
### pages table
|
||||
```sql
|
||||
- id (PK) - Auto-increment
|
||||
- site_id (FK) - References sites(id)
|
||||
- url - Page URL (UNIQUE)
|
||||
- depth - Crawl depth level
|
||||
- last_modified - Discovery time
|
||||
- priority - Sitemap priority (0.0-1.0)
|
||||
- change_freq - monthly/weekly/daily/etc
|
||||
```
|
||||
|
||||
### sessions table
|
||||
```sql
|
||||
- id (PK) - Auto-increment
|
||||
- session_id (UNIQUE) - Session UUID
|
||||
- uuid (FK) - References sites(uuid)
|
||||
- ip_address - Client IP
|
||||
- created_at - First seen
|
||||
- last_activity - Last request
|
||||
```
|
||||
|
||||
## 🧪 Testing
|
||||
|
||||
### Manual Testing
|
||||
```bash
|
||||
# Terminal 1: Start server
|
||||
./run.sh
|
||||
|
||||
# Terminal 2: Test API
|
||||
curl -X POST http://localhost:8080/generate-sitemap-xml \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url":"https://example.com","max_depth":2}'
|
||||
|
||||
# Terminal 3: Watch SSE stream
|
||||
curl -N http://localhost:8080/stream/{uuid}
|
||||
```
|
||||
|
||||
### Browser Testing
|
||||
1. Open multiple tabs to http://localhost:8080
|
||||
2. Start different crawls simultaneously
|
||||
3. Verify independent progress tracking
|
||||
4. Check database for metadata
|
||||
|
||||
### Database Verification
|
||||
```bash
|
||||
sqlite3 sitemap.db "SELECT * FROM sites ORDER BY created_at DESC LIMIT 5;"
|
||||
sqlite3 sitemap.db "SELECT COUNT(*) FROM pages WHERE site_id = 1;"
|
||||
```
|
||||
|
||||
## 📦 Deployment Options
|
||||
|
||||
### Option 1: Binary
|
||||
```bash
|
||||
go build -o sitemap-api
|
||||
./sitemap-api
|
||||
```
|
||||
|
||||
### Option 2: Docker
|
||||
```bash
|
||||
docker build -t sitemap-api .
|
||||
docker run -p 8080:8080 sitemap-api
|
||||
```
|
||||
|
||||
### Option 3: Systemd Service
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Sitemap Generator API
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=www-data
|
||||
WorkingDirectory=/opt/sitemap-api
|
||||
ExecStart=/opt/sitemap-api/sitemap-api
|
||||
Restart=always
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
## 🔧 Configuration
|
||||
|
||||
### Environment Variables
|
||||
```bash
|
||||
export PORT=8080 # Server port
|
||||
export DB_PATH=sitemap.db # Database file
|
||||
```
|
||||
|
||||
### Code Constants
|
||||
```go
|
||||
// crawler/crawler.go
|
||||
const maxConcurrent = 5 // Parallel requests
|
||||
const httpTimeout = 10 // Seconds
|
||||
|
||||
// handlers/handler.go
|
||||
const channelBuffer = 100 // SSE event buffer
|
||||
```
|
||||
|
||||
## 📝 XML Sitemap Format
|
||||
|
||||
Generated sitemaps follow the standard:
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url>
|
||||
<loc>https://example.com/</loc>
|
||||
<lastmod>2024-02-05</lastmod>
|
||||
<changefreq>monthly</changefreq>
|
||||
<priority>1.0</priority>
|
||||
</url>
|
||||
<url>
|
||||
<loc>https://example.com/about</loc>
|
||||
<lastmod>2024-02-05</lastmod>
|
||||
<changefreq>monthly</changefreq>
|
||||
<priority>0.8</priority>
|
||||
</url>
|
||||
</urlset>
|
||||
```
|
||||
|
||||
## 🎯 Success Criteria
|
||||
|
||||
All requirements met:
|
||||
- ✅ Go backend with excellent performance
|
||||
- ✅ Endpoint: `/generate-sitemap-xml` with UUID response
|
||||
- ✅ Endpoint: `/stream/{uuid}` for SSE
|
||||
- ✅ Endpoint: `/download/{uuid}` for XML
|
||||
- ✅ Multi-user concurrent support
|
||||
- ✅ Client metadata tracking (IP, browser, cookies, session)
|
||||
- ✅ SQLite storage
|
||||
- ✅ Root route `/` serves HTML
|
||||
- ✅ Real-time progress updates
|
||||
- ✅ Clean, maintainable code structure
|
||||
|
||||
## 📚 Next Steps
|
||||
|
||||
To extend this project:
|
||||
1. Add user authentication (JWT tokens)
|
||||
2. Implement rate limiting (go-rate package)
|
||||
3. Add robots.txt parsing (robotstxt.go package)
|
||||
4. Support sitemap index for large sites
|
||||
5. Add scheduling/cron jobs for recurring crawls
|
||||
6. Implement incremental updates
|
||||
7. Add webhook notifications
|
||||
8. Create admin dashboard
|
||||
9. Export to other formats (JSON, CSV)
|
||||
10. Add analytics and usage stats
|
||||
|
||||
---
|
||||
|
||||
**Ready to use! Just run `./run.sh` or `make run` to get started.**
|
||||
152
QUICKSTART.md
Normal file
152
QUICKSTART.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# 🚀 Quick Start Guide
|
||||
|
||||
Get your sitemap generator running in 3 steps!
|
||||
|
||||
## Step 1: Install Go
|
||||
|
||||
If you don't have Go installed:
|
||||
- Download from https://golang.org/dl/
|
||||
- Install Go 1.21 or later
|
||||
- Verify: `go version`
|
||||
|
||||
## Step 2: Run the Application
|
||||
|
||||
### Option A: Using the run script (easiest)
|
||||
```bash
|
||||
cd sitemap-api
|
||||
./run.sh
|
||||
```
|
||||
|
||||
### Option B: Using Make
|
||||
```bash
|
||||
cd sitemap-api
|
||||
make run
|
||||
```
|
||||
|
||||
### Option C: Manual
|
||||
```bash
|
||||
cd sitemap-api
|
||||
go mod download
|
||||
go build -o sitemap-api .
|
||||
./sitemap-api
|
||||
```
|
||||
|
||||
## Step 3: Use the Application
|
||||
|
||||
1. **Open your browser** → http://localhost:8080
|
||||
|
||||
2. **Enter a URL** → e.g., `https://example.com`
|
||||
|
||||
3. **Set crawl depth** → 1-5 (default: 3)
|
||||
|
||||
4. **Click "Generate Sitemap"** → Watch real-time progress!
|
||||
|
||||
5. **Download XML** → Click the download button when complete
|
||||
|
||||
## Testing Multiple Users
|
||||
|
||||
Open multiple browser tabs to http://localhost:8080 and start different crawls simultaneously. Each will have its own UUID and progress stream!
|
||||
|
||||
## API Usage Examples
|
||||
|
||||
### Start a crawl
|
||||
```bash
|
||||
curl -X POST http://localhost:8080/generate-sitemap-xml \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com", "max_depth": 3}'
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"uuid": "550e8400-e29b-41d4-a716-446655440000",
|
||||
"site_id": 123,
|
||||
"status": "processing",
|
||||
"stream_url": "/stream/550e8400-e29b-41d4-a716-446655440000",
|
||||
"message": "Sitemap generation started"
|
||||
}
|
||||
```
|
||||
|
||||
### Monitor progress (SSE)
|
||||
```bash
|
||||
curl http://localhost:8080/stream/550e8400-e29b-41d4-a716-446655440000
|
||||
```
|
||||
|
||||
### Download sitemap
|
||||
```bash
|
||||
curl http://localhost:8080/download/550e8400-e29b-41d4-a716-446655440000 -o sitemap.xml
|
||||
```
|
||||
|
||||
### List all sitemaps
|
||||
```bash
|
||||
curl http://localhost:8080/sites
|
||||
```
|
||||
|
||||
### Delete a sitemap
|
||||
```bash
|
||||
curl -X DELETE http://localhost:8080/sites/123
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Port already in use
|
||||
```bash
|
||||
PORT=3000 ./sitemap-api
|
||||
```
|
||||
|
||||
### Build errors
|
||||
```bash
|
||||
go mod tidy
|
||||
go clean -cache
|
||||
go build -o sitemap-api .
|
||||
```
|
||||
|
||||
### Database locked
|
||||
```bash
|
||||
rm sitemap.db
|
||||
./sitemap-api
|
||||
```
|
||||
|
||||
### CGO errors
|
||||
Make sure you have gcc installed:
|
||||
- **Ubuntu/Debian**: `sudo apt-get install build-essential`
|
||||
- **macOS**: `xcode-select --install`
|
||||
- **Windows**: Install MinGW or TDM-GCC
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Read the full [README.md](README.md) for details
|
||||
- Customize the crawler in `crawler/crawler.go`
|
||||
- Add authentication to handlers
|
||||
- Deploy to production (see README for nginx config)
|
||||
- Add more metadata tracking
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
sitemap-api/
|
||||
├── main.go # Server entry point
|
||||
├── handlers/ # HTTP handlers & SSE
|
||||
├── crawler/ # Web crawler logic
|
||||
├── database/ # SQLite operations
|
||||
├── models/ # Data structures
|
||||
├── static/ # Frontend (served at /)
|
||||
├── README.md # Full documentation
|
||||
├── run.sh # Quick start script
|
||||
├── Makefile # Build commands
|
||||
└── Dockerfile # Container setup
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
Having issues? Check:
|
||||
1. Go version >= 1.21
|
||||
2. Port 8080 is available
|
||||
3. SQLite3 is working
|
||||
4. All dependencies installed
|
||||
|
||||
Still stuck? Open an issue on GitHub!
|
||||
|
||||
---
|
||||
|
||||
**Built with ❤️ using Go + Goroutines + Server-Sent Events**
|
||||
213
README.md
Normal file
213
README.md
Normal file
@@ -0,0 +1,213 @@
|
||||
# XML Sitemap Generator API
|
||||
|
||||
A high-performance Go-based API for generating XML sitemaps with real-time progress tracking via Server-Sent Events (SSE).
|
||||
|
||||
## Features
|
||||
|
||||
- ✅ **Concurrent Web Crawling** - Fast sitemap generation using goroutines
|
||||
- ✅ **Real-time Progress** - SSE streaming for live updates
|
||||
- ✅ **Multi-user Support** - Handle multiple simultaneous crawls
|
||||
- ✅ **Client Metadata Tracking** - IP, browser, OS, session data stored in SQLite
|
||||
- ✅ **Clean REST API** - Simple endpoints for generate, stream, and download
|
||||
- ✅ **Professional UI** - Beautiful web interface included
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
sitemap-api/
|
||||
├── main.go # Entry point & HTTP server
|
||||
├── handlers/
|
||||
│ └── handler.go # HTTP handlers & SSE streaming
|
||||
├── crawler/
|
||||
│ └── crawler.go # Concurrent web crawler
|
||||
├── database/
|
||||
│ └── db.go # SQLite operations
|
||||
├── models/
|
||||
│ └── site.go # Data structures
|
||||
└── static/
|
||||
└── index.html # Frontend UI
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### `POST /generate-sitemap-xml`
|
||||
Start sitemap generation (backend generates UUID)
|
||||
|
||||
**Request:**
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"max_depth": 3
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"uuid": "550e8400-e29b-41d4-a716-446655440000",
|
||||
"site_id": 123,
|
||||
"status": "processing",
|
||||
"stream_url": "/stream/550e8400-...",
|
||||
"message": "Sitemap generation started"
|
||||
}
|
||||
```
|
||||
|
||||
### `GET /stream/{uuid}`
|
||||
Server-Sent Events stream for real-time progress
|
||||
|
||||
**Events:** `connected`, `started`, `progress`, `complete`, `error`
|
||||
|
||||
### `GET /download/{uuid}`
|
||||
Download generated sitemap XML
|
||||
|
||||
### `GET /sites`
|
||||
List all generated sitemaps
|
||||
|
||||
### `GET /sites/{id}`
|
||||
Get specific site details
|
||||
|
||||
### `DELETE /sites/{id}`
|
||||
Delete a sitemap
|
||||
|
||||
### `GET /health`
|
||||
Health check endpoint
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
- Go 1.21+
|
||||
- SQLite3
|
||||
|
||||
### Setup
|
||||
|
||||
```bash
|
||||
# Clone/navigate to directory
|
||||
cd sitemap-api
|
||||
|
||||
# Install dependencies
|
||||
go mod download
|
||||
|
||||
# Build
|
||||
go build -o sitemap-api
|
||||
|
||||
# Run
|
||||
./sitemap-api
|
||||
```
|
||||
|
||||
Server starts on **http://localhost:8080**
|
||||
|
||||
### Or run directly:
|
||||
```bash
|
||||
go run main.go
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Open http://localhost:8080 in your browser
|
||||
2. Enter a website URL
|
||||
3. Set crawl depth (1-5)
|
||||
4. Click "Generate Sitemap"
|
||||
5. Watch real-time progress
|
||||
6. Download XML when complete
|
||||
|
||||
## Database Schema
|
||||
|
||||
SQLite database (`sitemap.db`) stores:
|
||||
- **sites** - Crawl sessions with client metadata
|
||||
- **pages** - Discovered URLs with priority/frequency
|
||||
- **sessions** - User session tracking
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `PORT` - Server port (default: 8080)
|
||||
|
||||
Example:
|
||||
```bash
|
||||
PORT=3000 ./sitemap-api
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Frontend** sends POST to `/generate-sitemap-xml`
|
||||
2. **Backend** generates UUID, saves metadata, returns UUID
|
||||
3. **Frontend** connects to `/stream/{uuid}` for SSE updates
|
||||
4. **Crawler** runs in goroutine, sends events via channel
|
||||
5. **Handler** streams events to frontend in real-time
|
||||
6. **On completion**, sitemap available at `/download/{uuid}`
|
||||
|
||||
## Multi-User Concurrency
|
||||
|
||||
The `StreamManager` handles concurrent users:
|
||||
- Each UUID maps to a Go channel
|
||||
- Concurrent map with mutex for thread safety
|
||||
- Automatic cleanup after crawl completion
|
||||
- Supports unlimited simultaneous crawls
|
||||
|
||||
## Client Metadata Captured
|
||||
|
||||
- IP Address (with X-Forwarded-For support)
|
||||
- User-Agent
|
||||
- Browser name & version
|
||||
- Operating System
|
||||
- Device Type (Desktop/Mobile/Tablet)
|
||||
- Session ID (cookie-based)
|
||||
- All cookies (JSON)
|
||||
- Referrer
|
||||
|
||||
## Performance
|
||||
|
||||
- Concurrent crawling with goroutines
|
||||
- Configurable concurrency limit (default: 5 parallel requests)
|
||||
- Depth-limited to prevent infinite crawls
|
||||
- Same-domain restriction
|
||||
- Duplicate URL prevention
|
||||
- 10-second HTTP timeout per request
|
||||
|
||||
## Customization
|
||||
|
||||
### Adjust Concurrency
|
||||
Edit `crawler/crawler.go`:
|
||||
```go
|
||||
semaphore := make(chan struct{}, 10) // Increase to 10 concurrent
|
||||
```
|
||||
|
||||
### Change Priority Calculation
|
||||
Modify `calculatePriority()` in `crawler/crawler.go`
|
||||
|
||||
### Add Custom Metadata
|
||||
Extend `models.Site` struct and database schema
|
||||
|
||||
## Production Deployment
|
||||
|
||||
### Recommendations:
|
||||
1. Use reverse proxy (nginx/caddy)
|
||||
2. Enable HTTPS
|
||||
3. Add rate limiting
|
||||
4. Configure CORS properly
|
||||
5. Use PostgreSQL for production (replace SQLite)
|
||||
6. Add authentication
|
||||
7. Implement cleanup jobs for old sitemaps
|
||||
|
||||
### Example nginx config:
|
||||
```nginx
|
||||
location / {
|
||||
proxy_pass http://localhost:8080;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection 'upgrade';
|
||||
proxy_set_header Host $host;
|
||||
proxy_cache_bypass $http_upgrade;
|
||||
|
||||
# SSE support
|
||||
proxy_buffering off;
|
||||
proxy_cache off;
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
||||
## Support
|
||||
|
||||
For issues or questions, please open a GitHub issue.
|
||||
287
crawler.go
Normal file
287
crawler.go
Normal file
@@ -0,0 +1,287 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sitemap-api/database"
|
||||
"sitemap-api/models"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Crawler performs a depth-limited, same-domain crawl of one site,
// streaming progress events over a channel and persisting every
// discovered page. Fields uuid..totalPages are per-crawl state set by
// Crawl; visited/currentDepth/totalPages are shared across goroutines
// and guarded by mu.
type Crawler struct {
	db           *database.DB      // persistence layer for site/page records
	maxDepth     int               // maximum link depth to follow
	visited      map[string]bool   // normalized URLs already processed (guarded by mu)
	mu           sync.Mutex        // protects visited, currentDepth, totalPages
	baseURL      *url.URL          // crawl root; anchors same-domain checks and link resolution
	client       *http.Client      // HTTP client with timeout and redirect cap
	eventChan    chan models.Event // outbound SSE events (started/progress/complete/error)
	uuid         string            // crawl identifier supplied by the caller
	siteID       int               // database ID of the site being crawled
	currentDepth int               // depth of the most recently visited page
	totalPages   int               // count of unique pages discovered so far
}
|
||||
|
||||
func NewCrawler(db *database.DB) *Crawler {
|
||||
return &Crawler{
|
||||
db: db,
|
||||
client: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Crawl runs the full crawl for one site: it resets per-crawl state,
// validates the start URL, loads the site record for uuid, then walks
// the site recursively from depth 0. It blocks until the crawl tree is
// exhausted, after which the site is marked completed and a "complete"
// event is emitted. All failures are reported as "error" events on
// eventChan rather than returned.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	// Reset per-crawl state so a Crawler instance can be reused.
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0

	// Parse base URL; it anchors same-domain checks and relative-link resolution.
	parsedURL, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = parsedURL

	// Get site from database; its ID ties every discovered page to this crawl.
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID

	// Send started event
	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})

	// Start crawling from root; returns only after all spawned work finishes.
	c.crawlURL(startURL, 0)

	// Mark as completed
	err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}

	// Send completion event
	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}
|
||||
|
||||
// crawlURL visits one URL: it records the normalized URL as visited,
// emits a progress event, persists the page, fetches the document, and
// spawns goroutines for each same-domain link found. It returns once
// this URL and its entire subtree have been processed.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	// Check depth limit
	if depth > c.maxDepth {
		return
	}

	// Normalize URL so fragments / trailing slashes don't create duplicates.
	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}

	// Check if already visited. visited and the counters are shared across
	// the goroutines spawned below, so all access happens under mu.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages // snapshot under the lock for the progress event
	c.currentDepth = depth
	c.mu.Unlock()

	// Send progress event
	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})

	// Save page to database. NOTE(review): the page is counted and saved
	// before the fetch below, so URLs that fail to load (or return errors)
	// are still recorded in the sitemap — confirm this is intended.
	priority := c.calculatePriority(depth)
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     priority,
		ChangeFreq:   "monthly",
	}

	if err := c.db.AddPage(page); err != nil {
		// Log error but continue crawling
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}

	// Fetch the page
	resp, err := c.client.Get(normalizedURL)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// Only process HTML pages
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return
	}

	// Parse HTML and extract links
	links := c.extractLinks(resp)

	// Crawl found links concurrently (with limited concurrency).
	// NOTE(review): the semaphore is created per call, so the limit of 5
	// bounds concurrency per parent page, not globally across the crawl —
	// total in-flight requests can exceed 5 on wide sites.
	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests

	for _, link := range links {
		if depth+1 <= c.maxDepth {
			wg.Add(1)
			go func(l string) {
				defer wg.Done()
				semaphore <- struct{}{} // Acquire
				c.crawlURL(l, depth+1)
				<-semaphore // Release
			}(link)
		}
	}

	wg.Wait()
}
|
||||
|
||||
func (c *Crawler) extractLinks(resp *http.Response) []string {
|
||||
var links []string
|
||||
tokenizer := html.NewTokenizer(resp.Body)
|
||||
|
||||
for {
|
||||
tokenType := tokenizer.Next()
|
||||
if tokenType == html.ErrorToken {
|
||||
break
|
||||
}
|
||||
|
||||
if tokenType == html.StartTagToken {
|
||||
token := tokenizer.Token()
|
||||
if token.Data == "a" {
|
||||
for _, attr := range token.Attr {
|
||||
if attr.Key == "href" {
|
||||
link := c.resolveURL(attr.Val)
|
||||
if link != "" && c.isSameDomain(link) {
|
||||
links = append(links, link)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return links
|
||||
}
|
||||
|
||||
func (c *Crawler) resolveURL(href string) string {
|
||||
parsedURL, err := url.Parse(href)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Resolve relative URLs
|
||||
resolvedURL := c.baseURL.ResolveReference(parsedURL)
|
||||
return resolvedURL.String()
|
||||
}
|
||||
|
||||
func (c *Crawler) normalizeURL(urlStr string) string {
|
||||
parsedURL, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Remove fragment
|
||||
parsedURL.Fragment = ""
|
||||
|
||||
// Remove trailing slash for consistency
|
||||
parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
|
||||
if parsedURL.Path == "" {
|
||||
parsedURL.Path = "/"
|
||||
}
|
||||
|
||||
return parsedURL.String()
|
||||
}
|
||||
|
||||
func (c *Crawler) isSameDomain(urlStr string) bool {
|
||||
parsedURL, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// Check if same host
|
||||
if parsedURL.Host != c.baseURL.Host {
|
||||
return false
|
||||
}
|
||||
|
||||
// Skip common non-HTML files
|
||||
path := strings.ToLower(parsedURL.Path)
|
||||
skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
|
||||
for _, ext := range skipExtensions {
|
||||
if strings.HasSuffix(path, ext) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *Crawler) calculatePriority(depth int) float64 {
|
||||
// Homepage gets highest priority
|
||||
if depth == 0 {
|
||||
return 1.0
|
||||
}
|
||||
// Decrease priority with depth
|
||||
priority := 1.0 - (float64(depth) * 0.2)
|
||||
if priority < 0.3 {
|
||||
priority = 0.3
|
||||
}
|
||||
return priority
|
||||
}
|
||||
|
||||
func (c *Crawler) sendEvent(eventType string, data interface{}) {
|
||||
if c.eventChan != nil {
|
||||
select {
|
||||
case c.eventChan <- models.Event{Type: eventType, Data: data}:
|
||||
default:
|
||||
// Channel full or closed, skip event
|
||||
}
|
||||
}
|
||||
}
|
||||
253
db.go
Normal file
253
db.go
Normal file
@@ -0,0 +1,253 @@
|
||||
package database
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"sitemap-api/models"
|
||||
"time"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
// DB wraps the SQLite connection pool used by the sitemap service.
type DB struct {
	conn *sql.DB // pooled handle; all queries below go through it
}
|
||||
|
||||
func NewDB(dbPath string) (*DB, error) {
|
||||
conn, err := sql.Open("sqlite3", dbPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
db := &DB{conn: conn}
|
||||
if err := db.createTables(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Close releases the underlying SQLite connection pool.
func (db *DB) Close() error {
	return db.conn.Close()
}
|
||||
|
||||
func (db *DB) createTables() error {
|
||||
schema := `
|
||||
CREATE TABLE IF NOT EXISTS sites (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT UNIQUE NOT NULL,
|
||||
domain TEXT NOT NULL,
|
||||
url TEXT NOT NULL,
|
||||
max_depth INTEGER DEFAULT 3,
|
||||
page_count INTEGER DEFAULT 0,
|
||||
status TEXT DEFAULT 'processing',
|
||||
|
||||
ip_address TEXT,
|
||||
user_agent TEXT,
|
||||
browser TEXT,
|
||||
browser_version TEXT,
|
||||
os TEXT,
|
||||
device_type TEXT,
|
||||
|
||||
session_id TEXT,
|
||||
cookies TEXT,
|
||||
referrer TEXT,
|
||||
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
completed_at DATETIME,
|
||||
last_crawled DATETIME
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS pages (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
site_id INTEGER NOT NULL,
|
||||
url TEXT NOT NULL UNIQUE,
|
||||
depth INTEGER DEFAULT 0,
|
||||
last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
priority REAL DEFAULT 0.5,
|
||||
change_freq TEXT DEFAULT 'monthly',
|
||||
FOREIGN KEY (site_id) REFERENCES sites(id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS sessions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
session_id TEXT UNIQUE NOT NULL,
|
||||
uuid TEXT,
|
||||
ip_address TEXT,
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
last_activity DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (uuid) REFERENCES sites(uuid)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_uuid ON sites(uuid);
|
||||
CREATE INDEX IF NOT EXISTS idx_site_pages ON pages(site_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_session_id ON sessions(session_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_status ON sites(status);
|
||||
`
|
||||
|
||||
_, err := db.conn.Exec(schema)
|
||||
return err
|
||||
}
|
||||
|
||||
// CreateSite inserts a new crawl record and returns its generated row ID.
// All status/metadata fields are stored exactly as supplied by the
// caller; created_at is stamped with the current time.
func (db *DB) CreateSite(site *models.Site) (int, error) {
	query := `
	INSERT INTO sites (uuid, domain, url, max_depth, status, ip_address,
		user_agent, browser, browser_version, os, device_type, session_id,
		cookies, referrer, created_at)
	VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`

	result, err := db.conn.Exec(query,
		site.UUID, site.Domain, site.URL, site.MaxDepth, site.Status,
		site.IPAddress, site.UserAgent, site.Browser, site.BrowserVersion,
		site.OS, site.DeviceType, site.SessionID, site.Cookies, site.Referrer,
		time.Now(),
	)

	if err != nil {
		return 0, err
	}

	// LastInsertId is reliable for SQLite AUTOINCREMENT primary keys.
	id, err := result.LastInsertId()
	return int(id), err
}
|
||||
|
||||
// GetSiteByUUID fetches a single site row by its public UUID.
// Returns a "site not found" error when no row matches; other scan/query
// errors are returned as-is.
//
// NOTE(review): completed_at and last_crawled are nullable in the schema;
// scanning NULL into a plain time.Time fails — presumably models.Site
// declares them as *time.Time or sql.NullTime. Verify against models.
func (db *DB) GetSiteByUUID(uuid string) (*models.Site, error) {
	query := `
	SELECT id, uuid, domain, url, max_depth, page_count, status,
		ip_address, user_agent, browser, browser_version, os, device_type,
		session_id, cookies, referrer, created_at, completed_at, last_crawled
	FROM sites WHERE uuid = ?
	`

	site := &models.Site{}
	err := db.conn.QueryRow(query, uuid).Scan(
		&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
		&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
		&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
		&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
		&site.CompletedAt, &site.LastCrawled,
	)

	// Translate the sentinel "no rows" into a friendlier domain error.
	if err == sql.ErrNoRows {
		return nil, fmt.Errorf("site not found")
	}

	return site, err
}
|
||||
|
||||
// GetSiteByID fetches a single site row by its numeric primary key.
// Returns a "site not found" error when no row matches; other scan/query
// errors are returned as-is.
//
// NOTE(review): completed_at and last_crawled are nullable — see the
// remark on GetSiteByUUID about the scan target types.
func (db *DB) GetSiteByID(id int) (*models.Site, error) {
	query := `
	SELECT id, uuid, domain, url, max_depth, page_count, status,
		ip_address, user_agent, browser, browser_version, os, device_type,
		session_id, cookies, referrer, created_at, completed_at, last_crawled
	FROM sites WHERE id = ?
	`

	site := &models.Site{}
	err := db.conn.QueryRow(query, id).Scan(
		&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
		&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
		&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
		&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
		&site.CompletedAt, &site.LastCrawled,
	)

	// Translate the sentinel "no rows" into a friendlier domain error.
	if err == sql.ErrNoRows {
		return nil, fmt.Errorf("site not found")
	}

	return site, err
}
|
||||
|
||||
func (db *DB) GetAllSites() ([]*models.Site, error) {
|
||||
query := `
|
||||
SELECT id, uuid, domain, url, max_depth, page_count, status,
|
||||
ip_address, user_agent, browser, browser_version, os, device_type,
|
||||
session_id, cookies, referrer, created_at, completed_at, last_crawled
|
||||
FROM sites ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
rows, err := db.conn.Query(query)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
sites := []*models.Site{}
|
||||
for rows.Next() {
|
||||
site := &models.Site{}
|
||||
err := rows.Scan(
|
||||
&site.ID, &site.UUID, &site.Domain, &site.URL, &site.MaxDepth,
|
||||
&site.PageCount, &site.Status, &site.IPAddress, &site.UserAgent,
|
||||
&site.Browser, &site.BrowserVersion, &site.OS, &site.DeviceType,
|
||||
&site.SessionID, &site.Cookies, &site.Referrer, &site.CreatedAt,
|
||||
&site.CompletedAt, &site.LastCrawled,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sites = append(sites, site)
|
||||
}
|
||||
|
||||
return sites, nil
|
||||
}
|
||||
|
||||
// UpdateSiteStatus finalises a crawl record: it sets the status and page
// count and stamps completed_at / last_crawled with the current time.
// Note: the timestamps are written for ANY status value, so calling this
// with a non-terminal status would still mark the site as completed.
func (db *DB) UpdateSiteStatus(uuid string, status string, pageCount int) error {
	query := `
	UPDATE sites
	SET status = ?, page_count = ?, completed_at = ?, last_crawled = ?
	WHERE uuid = ?
	`

	now := time.Now()
	_, err := db.conn.Exec(query, status, pageCount, now, now, uuid)
	return err
}
|
||||
|
||||
func (db *DB) DeleteSite(id int) error {
|
||||
// Pages will be deleted automatically due to CASCADE
|
||||
_, err := db.conn.Exec("DELETE FROM sites WHERE id = ?", id)
|
||||
return err
|
||||
}
|
||||
|
||||
// AddPage records a crawled page for a site. INSERT OR IGNORE makes the
// call idempotent when the crawler revisits a URL.
//
// NOTE(review): with the schema's UNIQUE constraint declared on pages.url
// alone, the ignore is global — a second site crawling the same URL
// records nothing. Verify the constraint is scoped to (site_id, url).
func (db *DB) AddPage(page *models.Page) error {
	query := `
	INSERT OR IGNORE INTO pages (site_id, url, depth, last_modified, priority, change_freq)
	VALUES (?, ?, ?, ?, ?, ?)
	`

	_, err := db.conn.Exec(query,
		page.SiteID, page.URL, page.Depth, page.LastModified,
		page.Priority, page.ChangeFreq,
	)
	return err
}
|
||||
|
||||
func (db *DB) GetPagesBySiteID(siteID int) ([]*models.Page, error) {
|
||||
query := `
|
||||
SELECT id, site_id, url, depth, last_modified, priority, change_freq
|
||||
FROM pages WHERE site_id = ? ORDER BY depth, url
|
||||
`
|
||||
|
||||
rows, err := db.conn.Query(query, siteID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
pages := []*models.Page{}
|
||||
for rows.Next() {
|
||||
page := &models.Page{}
|
||||
err := rows.Scan(
|
||||
&page.ID, &page.SiteID, &page.URL, &page.Depth,
|
||||
&page.LastModified, &page.Priority, &page.ChangeFreq,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pages = append(pages, page)
|
||||
}
|
||||
|
||||
return pages, nil
|
||||
}
|
||||
11
go.mod
Normal file
11
go.mod
Normal file
@@ -0,0 +1,11 @@
|
||||
module sitemap-api
|
||||
|
||||
go 1.21
|
||||
|
||||
require (
|
||||
github.com/go-chi/chi/v5 v5.0.11
|
||||
github.com/go-chi/cors v1.2.1
|
||||
github.com/google/uuid v1.5.0
|
||||
github.com/mattn/go-sqlite3 v1.14.19
|
||||
golang.org/x/net v0.20.0
|
||||
)
|
||||
465
handler.go
Normal file
465
handler.go
Normal file
@@ -0,0 +1,465 @@
|
||||
package handlers
|
||||
|
||||
import (
	"encoding/json"
	"encoding/xml"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"time"

	"sitemap-api/crawler"
	"sitemap-api/database"
	"sitemap-api/models"

	"github.com/go-chi/chi/v5"
	"github.com/google/uuid"
)
|
||||
|
||||
// Handler bundles the HTTP endpoints with their shared dependencies: the
// persistence layer, a crawler instance, and the SSE stream registry.
type Handler struct {
	db            *database.DB
	crawler       *crawler.Crawler
	streamManager *StreamManager
}
|
||||
|
||||
func NewHandler(db *database.DB, streamManager *StreamManager) *Handler {
|
||||
return &Handler{
|
||||
db: db,
|
||||
crawler: crawler.NewCrawler(db),
|
||||
streamManager: streamManager,
|
||||
}
|
||||
}
|
||||
|
||||
// StreamManager handles multiple concurrent SSE connections.
// It maps a crawl UUID to the buffered channel its events flow through;
// the RWMutex guards the map, not the channels themselves.
type StreamManager struct {
	mu      sync.RWMutex
	streams map[string]chan models.Event
}
|
||||
|
||||
func NewStreamManager() *StreamManager {
|
||||
return &StreamManager{
|
||||
streams: make(map[string]chan models.Event),
|
||||
}
|
||||
}
|
||||
|
||||
func (sm *StreamManager) CreateStream(uuid string) chan models.Event {
|
||||
sm.mu.Lock()
|
||||
defer sm.mu.Unlock()
|
||||
|
||||
ch := make(chan models.Event, 100)
|
||||
sm.streams[uuid] = ch
|
||||
return ch
|
||||
}
|
||||
|
||||
func (sm *StreamManager) GetStream(uuid string) (chan models.Event, bool) {
|
||||
sm.mu.RLock()
|
||||
defer sm.mu.RUnlock()
|
||||
|
||||
ch, exists := sm.streams[uuid]
|
||||
return ch, exists
|
||||
}
|
||||
|
||||
func (sm *StreamManager) CloseStream(uuid string) {
|
||||
sm.mu.Lock()
|
||||
defer sm.mu.Unlock()
|
||||
|
||||
if ch, exists := sm.streams[uuid]; exists {
|
||||
close(ch)
|
||||
delete(sm.streams, uuid)
|
||||
}
|
||||
}
|
||||
|
||||
// GenerateSitemapXML handles POST /generate-sitemap-xml
|
||||
func (h *Handler) GenerateSitemapXML(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
URL string `json:"url"`
|
||||
MaxDepth int `json:"max_depth"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
http.Error(w, "Invalid request body", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate URL
|
||||
if req.URL == "" {
|
||||
http.Error(w, "URL is required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
parsedURL, err := url.Parse(req.URL)
|
||||
if err != nil || parsedURL.Scheme == "" || parsedURL.Host == "" {
|
||||
http.Error(w, "Invalid URL format", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Set default max depth
|
||||
if req.MaxDepth <= 0 || req.MaxDepth > 5 {
|
||||
req.MaxDepth = 3
|
||||
}
|
||||
|
||||
// Generate UUID server-side
|
||||
generatedUUID := uuid.New().String()
|
||||
|
||||
// Extract client metadata
|
||||
ip := getClientIP(r)
|
||||
userAgent := r.Header.Get("User-Agent")
|
||||
browser, browserVersion := parseBrowser(userAgent)
|
||||
os := parseOS(userAgent)
|
||||
deviceType := parseDeviceType(userAgent)
|
||||
sessionID := getOrCreateSession(r)
|
||||
cookies := extractCookies(r)
|
||||
referrer := r.Header.Get("Referer")
|
||||
|
||||
// Extract domain from URL
|
||||
domain := parsedURL.Host
|
||||
|
||||
// Create site record
|
||||
site := &models.Site{
|
||||
UUID: generatedUUID,
|
||||
Domain: domain,
|
||||
URL: req.URL,
|
||||
MaxDepth: req.MaxDepth,
|
||||
Status: "processing",
|
||||
IPAddress: ip,
|
||||
UserAgent: userAgent,
|
||||
Browser: browser,
|
||||
BrowserVersion: browserVersion,
|
||||
OS: os,
|
||||
DeviceType: deviceType,
|
||||
SessionID: sessionID,
|
||||
Cookies: cookies,
|
||||
Referrer: referrer,
|
||||
}
|
||||
|
||||
siteID, err := h.db.CreateSite(site)
|
||||
if err != nil {
|
||||
http.Error(w, fmt.Sprintf("Failed to create site: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Create SSE stream for this UUID
|
||||
eventChan := h.streamManager.CreateStream(generatedUUID)
|
||||
|
||||
// Start crawling in background (non-blocking)
|
||||
go func() {
|
||||
h.crawler.Crawl(generatedUUID, req.URL, req.MaxDepth, eventChan)
|
||||
// Close stream after crawl completes
|
||||
time.Sleep(2 * time.Second) // Give time for final events to be sent
|
||||
h.streamManager.CloseStream(generatedUUID)
|
||||
}()
|
||||
|
||||
// Return immediately with UUID
|
||||
response := map[string]interface{}{
|
||||
"uuid": generatedUUID,
|
||||
"site_id": siteID,
|
||||
"status": "processing",
|
||||
"stream_url": "/stream/" + generatedUUID,
|
||||
"message": "Sitemap generation started",
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
|
||||
// StreamSSE handles GET /stream/{uuid}
|
||||
func (h *Handler) StreamSSE(w http.ResponseWriter, r *http.Request) {
|
||||
uuid := chi.URLParam(r, "uuid")
|
||||
|
||||
// Get event channel for this UUID
|
||||
eventChan, exists := h.streamManager.GetStream(uuid)
|
||||
if !exists {
|
||||
http.Error(w, "Stream not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
// Set SSE headers
|
||||
w.Header().Set("Content-Type", "text/event-stream")
|
||||
w.Header().Set("Cache-Control", "no-cache")
|
||||
w.Header().Set("Connection", "keep-alive")
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
http.Error(w, "Streaming unsupported", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Send connected event
|
||||
connectedData := map[string]string{
|
||||
"uuid": uuid,
|
||||
"message": "Connected to stream",
|
||||
}
|
||||
connectedJSON, _ := json.Marshal(connectedData)
|
||||
fmt.Fprintf(w, "event: connected\ndata: %s\n\n", connectedJSON)
|
||||
flusher.Flush()
|
||||
|
||||
// Stream events
|
||||
for event := range eventChan {
|
||||
data, err := json.Marshal(event.Data)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(w, "event: %s\ndata: %s\n\n", event.Type, data)
|
||||
flusher.Flush()
|
||||
}
|
||||
}
|
||||
|
||||
// DownloadSitemap handles GET /download/{uuid}
|
||||
func (h *Handler) DownloadSitemap(w http.ResponseWriter, r *http.Request) {
|
||||
uuidParam := chi.URLParam(r, "uuid")
|
||||
|
||||
// Get site by UUID
|
||||
site, err := h.db.GetSiteByUUID(uuidParam)
|
||||
if err != nil {
|
||||
http.Error(w, "Sitemap not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
// Get all pages for this site
|
||||
pages, err := h.db.GetPagesBySiteID(site.ID)
|
||||
if err != nil {
|
||||
http.Error(w, "Failed to retrieve pages", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// Generate XML sitemap
|
||||
sitemap := generateXMLSitemap(pages)
|
||||
|
||||
// Set headers
|
||||
filename := fmt.Sprintf("sitemap-%s.xml", strings.ReplaceAll(site.Domain, ".", "-"))
|
||||
w.Header().Set("Content-Type", "application/xml; charset=utf-8")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", filename))
|
||||
w.Header().Set("X-Generated-At", time.Now().Format(time.RFC3339))
|
||||
|
||||
// Write XML
|
||||
w.Write([]byte(xml.Header))
|
||||
w.Write([]byte(sitemap))
|
||||
}
|
||||
|
||||
// GetSites handles GET /sites
|
||||
func (h *Handler) GetSites(w http.ResponseWriter, r *http.Request) {
|
||||
sites, err := h.db.GetAllSites()
|
||||
if err != nil {
|
||||
http.Error(w, "Failed to retrieve sites", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(sites)
|
||||
}
|
||||
|
||||
// GetSite handles GET /sites/{id}
|
||||
func (h *Handler) GetSite(w http.ResponseWriter, r *http.Request) {
|
||||
idParam := chi.URLParam(r, "id")
|
||||
id, err := strconv.Atoi(idParam)
|
||||
if err != nil {
|
||||
http.Error(w, "Invalid site ID", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
site, err := h.db.GetSiteByID(id)
|
||||
if err != nil {
|
||||
http.Error(w, "Site not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(site)
|
||||
}
|
||||
|
||||
// DeleteSite handles DELETE /sites/{id}
|
||||
func (h *Handler) DeleteSite(w http.ResponseWriter, r *http.Request) {
|
||||
idParam := chi.URLParam(r, "id")
|
||||
id, err := strconv.Atoi(idParam)
|
||||
if err != nil {
|
||||
http.Error(w, "Invalid site ID", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.db.DeleteSite(id); err != nil {
|
||||
http.Error(w, "Failed to delete site", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
"success": true,
|
||||
"message": "Site deleted successfully",
|
||||
})
|
||||
}
|
||||
|
||||
// Health handles GET /health
|
||||
func (h *Handler) Health(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]string{
|
||||
"status": "healthy",
|
||||
"time": time.Now().Format(time.RFC3339),
|
||||
})
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
func getClientIP(r *http.Request) string {
|
||||
// Check X-Forwarded-For header first
|
||||
forwarded := r.Header.Get("X-Forwarded-For")
|
||||
if forwarded != "" {
|
||||
// Get first IP if multiple
|
||||
ips := strings.Split(forwarded, ",")
|
||||
return strings.TrimSpace(ips[0])
|
||||
}
|
||||
|
||||
// Check X-Real-IP header
|
||||
realIP := r.Header.Get("X-Real-IP")
|
||||
if realIP != "" {
|
||||
return realIP
|
||||
}
|
||||
|
||||
// Fallback to RemoteAddr
|
||||
ip := r.RemoteAddr
|
||||
if strings.Contains(ip, ":") {
|
||||
ip = strings.Split(ip, ":")[0]
|
||||
}
|
||||
return ip
|
||||
}
|
||||
|
||||
func parseBrowser(userAgent string) (string, string) {
|
||||
ua := strings.ToLower(userAgent)
|
||||
|
||||
browsers := map[string]string{
|
||||
"edg": "Edge",
|
||||
"chrome": "Chrome",
|
||||
"firefox": "Firefox",
|
||||
"safari": "Safari",
|
||||
"opera": "Opera",
|
||||
}
|
||||
|
||||
for key, name := range browsers {
|
||||
if strings.Contains(ua, key) {
|
||||
// Extract version
|
||||
version := extractVersion(ua, key)
|
||||
return name, version
|
||||
}
|
||||
}
|
||||
|
||||
return "Unknown", ""
|
||||
}
|
||||
|
||||
// extractVersion pulls the dotted version number that follows a browser
// token (e.g. "chrome/120.0.1") out of an already-lowercased UA string.
// Returns "" when the token is absent or no digits follow it.
func extractVersion(ua, browser string) string {
	idx := strings.Index(ua, browser)
	if idx == -1 {
		return ""
	}

	// Drop the token and the '/' or ' ' separators that follow it.
	rest := strings.TrimLeft(ua[idx+len(browser):], "/ ")

	// Take the leading run of digits and dots as the version.
	end := 0
	for end < len(rest) {
		ch := rest[end]
		if ch != '.' && (ch < '0' || ch > '9') {
			break
		}
		end++
	}
	return rest[:end]
}
|
||||
|
||||
// parseOS identifies the operating system from a User-Agent string.
// Mobile Apple devices advertise "... like Mac OS X", so the iphone/ipad
// tokens MUST be tested before "mac os x" — the original order reported
// every iOS device as macOS. Android is likewise tested before "linux",
// which Android UAs also contain.
func parseOS(userAgent string) string {
	ua := strings.ToLower(userAgent)

	oses := []struct {
		keyword string
		name    string
	}{
		{"iphone", "iOS"},
		{"ipad", "iOS"},
		{"android", "Android"},
		{"windows nt 10", "Windows 10"},
		{"windows nt 11", "Windows 11"},
		{"mac os x", "macOS"},
		{"linux", "Linux"},
	}

	for _, os := range oses {
		if strings.Contains(ua, os.keyword) {
			return os.name
		}
	}

	return "Unknown"
}
|
||||
|
||||
// parseDeviceType classifies a User-Agent as Mobile, Tablet, or Desktop.
// Tablets MUST be checked first: iPad UAs contain "Mobile" and Android
// tablet UAs contain "android", so the mobile test (which the original
// ran first) would claim them.
func parseDeviceType(userAgent string) string {
	ua := strings.ToLower(userAgent)

	if strings.Contains(ua, "tablet") || strings.Contains(ua, "ipad") {
		return "Tablet"
	}

	if strings.Contains(ua, "mobile") || strings.Contains(ua, "android") || strings.Contains(ua, "iphone") {
		return "Mobile"
	}

	return "Desktop"
}
|
||||
|
||||
func getOrCreateSession(r *http.Request) string {
|
||||
// Try to get existing session from cookie
|
||||
cookie, err := r.Cookie("session_id")
|
||||
if err == nil && cookie.Value != "" {
|
||||
return cookie.Value
|
||||
}
|
||||
|
||||
// Generate new session ID
|
||||
return uuid.New().String()
|
||||
}
|
||||
|
||||
func extractCookies(r *http.Request) string {
|
||||
cookies := r.Cookies()
|
||||
if len(cookies) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
cookieData := make(map[string]string)
|
||||
for _, cookie := range cookies {
|
||||
cookieData[cookie.Name] = cookie.Value
|
||||
}
|
||||
|
||||
data, _ := json.Marshal(cookieData)
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func generateXMLSitemap(pages []*models.Page) string {
|
||||
var sb strings.Builder
|
||||
|
||||
sb.WriteString("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")
|
||||
|
||||
for _, page := range pages {
|
||||
sb.WriteString(" <url>\n")
|
||||
sb.WriteString(fmt.Sprintf(" <loc>%s</loc>\n", xmlEscape(page.URL)))
|
||||
sb.WriteString(fmt.Sprintf(" <lastmod>%s</lastmod>\n", page.LastModified.Format("2006-01-02")))
|
||||
sb.WriteString(fmt.Sprintf(" <changefreq>%s</changefreq>\n", page.ChangeFreq))
|
||||
sb.WriteString(fmt.Sprintf(" <priority>%.1f</priority>\n", page.Priority))
|
||||
sb.WriteString(" </url>\n")
|
||||
}
|
||||
|
||||
sb.WriteString("</urlset>")
|
||||
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// xmlEscape replaces the five XML special characters with their entity
// references so arbitrary URLs can be embedded in sitemap elements.
// (The previous version's replacement targets had been corrupted into
// no-op identity replacements, leaving output unescaped.)
// strings.NewReplacer performs a single left-to-right pass, so '&' in
// the input cannot be double-escaped by later replacements.
func xmlEscape(s string) string {
	replacer := strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
		"'", "&apos;",
	)
	return replacer.Replace(s)
}
|
||||
726
index.html
Normal file
726
index.html
Normal file
@@ -0,0 +1,726 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Sitemap Generator</title>
|
||||
<style>
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
min-height: 100vh;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
border-radius: 12px;
|
||||
box-shadow: 0 20px 40px rgba(0,0,0,0.1);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.header {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
padding: 30px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.header h1 {
|
||||
font-size: 2.5rem;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.header p {
|
||||
opacity: 0.9;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
.main {
|
||||
padding: 40px;
|
||||
}
|
||||
|
||||
.form-section {
|
||||
background: #f8f9fa;
|
||||
padding: 30px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.form-group {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
label {
|
||||
display: block;
|
||||
margin-bottom: 8px;
|
||||
font-weight: 600;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
input[type="text"], input[type="number"] {
|
||||
width: 100%;
|
||||
padding: 12px;
|
||||
border: 2px solid #e9ecef;
|
||||
border-radius: 6px;
|
||||
font-size: 1rem;
|
||||
transition: border-color 0.3s;
|
||||
}
|
||||
|
||||
input[type="text"]:focus, input[type="number"]:focus {
|
||||
outline: none;
|
||||
border-color: #667eea;
|
||||
}
|
||||
|
||||
.btn {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 30px;
|
||||
border-radius: 6px;
|
||||
font-size: 1rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: transform 0.2s, box-shadow 0.2s;
|
||||
}
|
||||
|
||||
.btn:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 10px 20px rgba(102, 126, 234, 0.3);
|
||||
}
|
||||
|
||||
.btn:disabled {
|
||||
background: #6c757d;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
box-shadow: none;
|
||||
}
|
||||
|
||||
.btn-secondary {
|
||||
background: #28a745;
|
||||
margin-left: 10px;
|
||||
}
|
||||
|
||||
.btn-danger {
|
||||
background: #dc3545;
|
||||
margin-left: 10px;
|
||||
}
|
||||
|
||||
.progress-section {
|
||||
display: none;
|
||||
background: #f8f9fa;
|
||||
padding: 30px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.progress-bar {
|
||||
width: 100%;
|
||||
height: 20px;
|
||||
background: #e9ecef;
|
||||
border-radius: 10px;
|
||||
overflow: hidden;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.progress-fill {
|
||||
height: 100%;
|
||||
background: linear-gradient(90deg, #667eea, #764ba2);
|
||||
width: 0%;
|
||||
transition: width 0.3s ease;
|
||||
border-radius: 10px;
|
||||
}
|
||||
|
||||
.status {
|
||||
padding: 15px;
|
||||
border-radius: 6px;
|
||||
margin-bottom: 20px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.status.info {
|
||||
background: #d1ecf1;
|
||||
color: rgb(12, 84, 96);
|
||||
border: 1px solid #bee5eb;
|
||||
}
|
||||
|
||||
.status.success {
|
||||
background: #d4edda;
|
||||
color: rgb(21, 87, 36);
|
||||
border: 1px solid #c3e6cb;
|
||||
}
|
||||
|
||||
.status.error {
|
||||
background: #f8d7da;
|
||||
color: rgb(114, 28, 36);
|
||||
border: 1px solid #f5c6cb;
|
||||
}
|
||||
|
||||
.log-section {
|
||||
background: #2d3748;
|
||||
color: #e2e8f0;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9rem;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.log-entry {
|
||||
margin-bottom: 10px;
|
||||
padding: 8px;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.log-entry.start {
|
||||
background: rgba(102, 126, 234, 0.2);
|
||||
border-left: 3px solid #667eea;
|
||||
}
|
||||
|
||||
.log-entry.progress {
|
||||
background: rgba(40, 167, 69, 0.1);
|
||||
border-left: 3px solid #28a745;
|
||||
}
|
||||
|
||||
.log-entry.complete {
|
||||
background: rgba(25, 135, 84, 0.1);
|
||||
border-left: 3px solid #198754;
|
||||
}
|
||||
|
||||
.log-entry.error {
|
||||
background: rgba(220, 53, 69, 0.1);
|
||||
border-left: 3px solid #dc3545;
|
||||
}
|
||||
|
||||
.results-section {
|
||||
display: none;
|
||||
background: #f8f9fa;
|
||||
padding: 30px;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
.stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 20px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.stat-card {
|
||||
background: white;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-number {
|
||||
font-size: 2rem;
|
||||
font-weight: bold;
|
||||
color: #667eea;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
color: #6c757d;
|
||||
margin-top: 5px;
|
||||
}
|
||||
|
||||
.sites-list {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.site-item {
|
||||
padding: 20px;
|
||||
border-bottom: 1px solid #e9ecef;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.site-item:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.site-domain {
|
||||
font-weight: 600;
|
||||
color: #333;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.site-meta {
|
||||
color: #6c757d;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.connection-status {
|
||||
display: inline-block;
|
||||
padding: 5px 10px;
|
||||
border-radius: 4px;
|
||||
font-size: 0.9rem;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.connection-status.connected {
|
||||
background: #d4edda;
|
||||
color: #155724;
|
||||
}
|
||||
|
||||
.connection-status.disconnected {
|
||||
background: #f8d7da;
|
||||
color: #721c24;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>🗺️ XML Sitemap Generator</h1>
|
||||
<p>Generate sitemaps for your websites with real-time progress tracking</p>
|
||||
</div>
|
||||
|
||||
<div class="main">
|
||||
<!-- Input Form -->
|
||||
<div class="form-section">
|
||||
<div class="form-group">
|
||||
<label for="siteUrl">Website URL</label>
|
||||
<input type="text" id="siteUrl" placeholder="https://example.com" value="https://example.com">
|
||||
</div>
|
||||
|
||||
<div class="form-group">
|
||||
<label for="maxDepth">Max Crawl Depth (1-5)</label>
|
||||
<input type="number" id="maxDepth" min="1" max="5" value="3">
|
||||
</div>
|
||||
|
||||
<button class="btn" id="generateBtn" onclick="sitemapGen.generateSitemap()">
|
||||
🚀 Generate Sitemap
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Progress Section -->
|
||||
<div class="progress-section" id="progressSection">
|
||||
<div id="connectionStatus" class="connection-status disconnected">🔴 Disconnected</div>
|
||||
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill" id="progressFill"></div>
|
||||
</div>
|
||||
|
||||
<div class="status info" id="statusMessage">
|
||||
Initializing...
|
||||
</div>
|
||||
|
||||
<div class="stats-grid">
|
||||
<div class="stat-card">
|
||||
<div class="stat-number" id="totalPages">0</div>
|
||||
<div class="stat-label">Pages Found</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number" id="currentDepth">0</div>
|
||||
<div class="stat-label">Current Depth</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number" id="crawlTime">0s</div>
|
||||
<div class="stat-label">Crawl Time</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="currentUrl" style="margin-bottom: 20px; font-size: 0.9rem; color: #6c757d;">
|
||||
Current: -
|
||||
</div>
|
||||
|
||||
<button class="btn btn-secondary" id="downloadBtn" onclick="sitemapGen.downloadSitemap()" style="display: none;">
|
||||
📥 Download Sitemap
|
||||
</button>
|
||||
<button class="btn btn-danger" onclick="sitemapGen.clearAll()">
|
||||
🗑️ Clear All
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Log Section -->
|
||||
<div class="log-section" id="logSection" style="display: none;">
|
||||
<div id="logContainer"></div>
|
||||
</div>
|
||||
|
||||
<!-- Results Section -->
|
||||
<div class="results-section" id="resultsSection">
|
||||
<h3 style="margin-bottom: 20px;">Previously Generated Sitemaps</h3>
|
||||
<div class="sites-list" id="sitesList"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Front-end controller for the sitemap generator page. Submits crawl
// requests, follows the server's SSE progress stream, renders stats and
// log entries, and manages the list of previously generated sitemaps.
//
// Security note: domain names, statuses and crawled URLs all originate
// from remote sites and are untrusted; every place they reach the DOM
// uses textContent / DOM construction instead of innerHTML interpolation
// to prevent DOM-based XSS.
class SitemapGenerator {
    constructor() {
        this.currentUUID = null;   // UUID of the most recent / selected crawl
        this.currentSiteId = null; // database id of that crawl's site record
        this.eventSource = null;   // open SSE connection, if any
        this.startTime = null;     // Date.now() when the current crawl started
        this.crawlTimer = null;    // interval handle updating the elapsed clock
        this.totalPages = 0;       // latest pages_found reported by the stream

        // Load existing sites on init
        this.loadExistingSites();
    }

    // Validate the form, POST the crawl request, then attach to the
    // SSE progress stream for the UUID the server assigns.
    async generateSitemap() {
        let url = document.getElementById('siteUrl').value.trim();
        const maxDepth = parseInt(document.getElementById('maxDepth').value);

        if (!url) {
            alert('Please enter a website URL');
            return;
        }

        // Auto-add protocol if missing
        if (!url.startsWith('http://') && !url.startsWith('https://')) {
            url = 'https://' + url;
            document.getElementById('siteUrl').value = url;
        }

        try {
            this.disableForm();
            this.showProgress();
            this.addLog(`Starting crawl of ${url}`, 'start');

            const response = await fetch('/generate-sitemap-xml', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                credentials: 'include',
                body: JSON.stringify({
                    url: url,
                    max_depth: maxDepth
                })
            });

            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }

            const result = await response.json();
            this.currentUUID = result.uuid;
            this.currentSiteId = result.site_id;

            this.addLog(`UUID assigned: ${result.uuid}`, 'start');
            this.addLog(`${result.message}`, 'start');

            // Connect to stream with UUID
            this.connectToStream(result.uuid);

        } catch (error) {
            this.addLog(`Failed to start crawl: ${error.message}`, 'error');
            this.showError(error.message);
            this.stopCrawlTimer();
            this.enableForm();
        }
    }

    // Open the SSE stream for a crawl and wire up all event handlers
    // ('connected', 'started', 'progress', 'complete', 'error').
    connectToStream(uuid) {
        if (this.eventSource) {
            this.eventSource.close();
        }

        this.addLog(`Connecting to stream: /stream/${uuid}`, 'start');

        try {
            this.eventSource = new EventSource(`/stream/${uuid}`);

            this.eventSource.addEventListener('connected', (e) => {
                // The payload is not needed here; parsing it could only
                // throw on a malformed frame, so it is intentionally skipped.
                this.addLog(`Connected to stream`, 'progress');
                this.updateConnectionStatus(true);
                document.getElementById('statusMessage').textContent = '🔄 Crawling...';
            });

            this.eventSource.addEventListener('started', (e) => {
                const data = JSON.parse(e.data);
                this.startTime = Date.now();
                this.startCrawlTimer();
                this.addLog(`Crawl started: ${data.url}`, 'start');
            });

            this.eventSource.addEventListener('progress', (e) => {
                const data = JSON.parse(e.data);
                this.updateProgress(data);
                if (data.current_url) {
                    this.addLog(`Depth ${data.depth}: ${data.current_url}`, 'progress');
                }
            });

            this.eventSource.addEventListener('complete', (e) => {
                const data = JSON.parse(e.data);
                this.completeGeneration(data);
                this.addLog(`✅ Complete! Found ${data.pages_found} pages`, 'complete');
                this.showDownloadButton();
                this.eventSource.close();
                this.stopCrawlTimer();
                this.loadExistingSites();
            });

            // Server-sent "error" events carry a JSON payload; transport
            // errors dispatched under the same name have no e.data.
            this.eventSource.addEventListener('error', (e) => {
                if (e.data) {
                    try {
                        const data = JSON.parse(e.data);
                        this.showError(data.error);
                        this.addLog(`❌ Error: ${data.error}`, 'error');
                    } catch (err) {
                        this.addLog('Stream error occurred', 'error');
                    }
                }
                this.eventSource.close();
                this.updateConnectionStatus(false);
                this.enableForm();
                this.stopCrawlTimer();
            });

            this.eventSource.onerror = (e) => {
                this.updateConnectionStatus(false);
                this.addLog('Stream connection lost', 'error');
            };

        } catch (error) {
            this.addLog(`Failed to connect to stream: ${error.message}`, 'error');
        }
    }


    // Start (or restart) the once-a-second elapsed-time display.
    startCrawlTimer() {
        this.stopCrawlTimer();

        this.crawlTimer = setInterval(() => {
            if (this.startTime) {
                const elapsed = Math.floor((Date.now() - this.startTime) / 1000);
                const element = document.getElementById('crawlTime');
                if (element) {
                    element.textContent = `${elapsed}s`;
                }
            }
        }, 1000);
    }

    // Stop the elapsed-time display, if running.
    stopCrawlTimer() {
        if (this.crawlTimer) {
            clearInterval(this.crawlTimer);
            this.crawlTimer = null;
        }
    }

    // Reflect a 'progress' event in the stats cards and progress bar.
    updateProgress(data) {
        if (data.pages_found !== undefined) {
            this.totalPages = data.pages_found;
            document.getElementById('totalPages').textContent = data.pages_found;
        }

        if (data.depth !== undefined) {
            document.getElementById('currentDepth').textContent = data.depth;
        }

        if (data.current_url) {
            // textContent keeps the untrusted crawled URL inert.
            document.getElementById('currentUrl').textContent = `Current: ${data.current_url}`;
        }

        // Update progress bar (estimated: assumes ~100 pages, capped at
        // 90% until the 'complete' event pins it to 100%).
        const progress = Math.min((this.totalPages / 100) * 100, 90);
        document.getElementById('progressFill').style.width = `${progress}%`;
    }

    // Handle the 'complete' event: final stats, full bar, re-enable form.
    completeGeneration(data) {
        this.enableForm();
        document.getElementById('progressFill').style.width = '100%';
        document.getElementById('statusMessage').className = 'status success';
        document.getElementById('statusMessage').textContent = `✅ Complete! Found ${data.pages_found || this.totalPages} pages`;

        if (this.startTime) {
            const elapsed = Math.floor((Date.now() - this.startTime) / 1000);
            document.getElementById('crawlTime').textContent = `${elapsed}s`;
        }
    }

    showDownloadButton() {
        document.getElementById('downloadBtn').style.display = 'inline-block';
    }

    // Download the current crawl's sitemap; falls back to the newest
    // previously generated site when no crawl ran in this session.
    async downloadSitemap() {
        if (!this.currentUUID) {
            // Try to get the latest site
            await this.loadExistingSites();
        }

        if (this.currentUUID) {
            window.open(`/download/${this.currentUUID}`, '_blank');
        } else {
            alert('No sitemap available for download');
        }
    }

    // Fetch and render the list of previously generated sitemaps; the
    // newest one becomes the current selection.
    async loadExistingSites() {
        try {
            const response = await fetch('/sites');
            if (!response.ok) return;

            const sites = await response.json();
            this.displaySites(sites);

            if (sites.length > 0) {
                this.currentUUID = sites[0].uuid;
                this.currentSiteId = sites[0].id;
            }
        } catch (error) {
            console.error('Failed to load sites:', error);
        }
    }

    // Render the sites list. All site-derived text (domain, status,
    // uuid) is inserted via textContent / DOM construction rather than
    // innerHTML or inline onclick attributes, because these values come
    // from crawled sites and are untrusted (XSS).
    displaySites(sites) {
        const container = document.getElementById('sitesList');
        container.innerHTML = '';

        if (sites.length === 0) {
            container.innerHTML = '<p>No sitemaps generated yet.</p>';
            document.getElementById('resultsSection').style.display = 'none';
            return;
        }

        sites.forEach(site => {
            const siteDiv = document.createElement('div');
            siteDiv.className = 'site-item';
            const createdDate = new Date(site.created_at).toLocaleString();

            const info = document.createElement('div');
            info.className = 'site-info';

            const domain = document.createElement('div');
            domain.className = 'site-domain';
            domain.textContent = site.domain;

            const meta = document.createElement('div');
            meta.className = 'site-meta';
            meta.textContent = `${site.page_count} pages • Status: ${site.status} • Created: ${createdDate}`;

            info.appendChild(domain);
            info.appendChild(meta);

            const actions = document.createElement('div');

            const downloadBtn = document.createElement('button');
            downloadBtn.className = 'btn btn-secondary';
            downloadBtn.textContent = '📥 Download';
            downloadBtn.addEventListener('click', () => this.downloadSiteSitemap(site.uuid));

            const deleteBtn = document.createElement('button');
            deleteBtn.className = 'btn btn-danger';
            deleteBtn.textContent = '🗑️ Delete';
            deleteBtn.addEventListener('click', () => this.deleteSite(site.id));

            actions.appendChild(downloadBtn);
            actions.appendChild(deleteBtn);

            siteDiv.appendChild(info);
            siteDiv.appendChild(actions);
            container.appendChild(siteDiv);
        });

        document.getElementById('resultsSection').style.display = 'block';
    }

    async downloadSiteSitemap(uuid) {
        window.open(`/download/${uuid}`, '_blank');
    }

    // Delete one generated sitemap after user confirmation.
    async deleteSite(siteId) {
        if (!confirm('Are you sure you want to delete this sitemap?')) return;

        try {
            const response = await fetch(`/sites/${siteId}`, {
                method: 'DELETE'
            });

            if (response.ok) {
                this.addLog(`Site ${siteId} deleted`, 'complete');
                this.loadExistingSites();
            } else {
                throw new Error('Failed to delete site');
            }
        } catch (error) {
            this.addLog(`Delete failed: ${error.message}`, 'error');
        }
    }

    // Delete every stored site, one DELETE request per entry.
    async clearAll() {
        if (!confirm('Are you sure you want to clear all data? This cannot be undone.')) return;

        try {
            this.stopCrawlTimer();

            const sitesResponse = await fetch('/sites');
            if (sitesResponse.ok) {
                const sites = await sitesResponse.json();
                for (const site of sites) {
                    await fetch(`/sites/${site.id}`, {
                        method: 'DELETE'
                    });
                }
            }

            this.addLog('All data cleared', 'complete');
            this.loadExistingSites();
            this.hideProgress();
        } catch (error) {
            this.addLog(`Clear failed: ${error.message}`, 'error');
        }
    }

    showProgress() {
        document.getElementById('progressSection').style.display = 'block';
        document.getElementById('logSection').style.display = 'block';
    }

    hideProgress() {
        document.getElementById('progressSection').style.display = 'none';
    }

    showError(message) {
        document.getElementById('statusMessage').className = 'status error';
        document.getElementById('statusMessage').textContent = `❌ Error: ${message}`;
    }

    // Prepend a timestamped entry to the log panel. The message may
    // embed crawled URLs, so it is added as a text node, never innerHTML.
    addLog(message, type = 'info') {
        const container = document.getElementById('logContainer');
        const entry = document.createElement('div');
        entry.className = `log-entry ${type}`;
        const timestamp = new Date().toLocaleTimeString();

        const ts = document.createElement('strong');
        ts.textContent = `[${timestamp}]`;
        entry.appendChild(ts);
        entry.appendChild(document.createTextNode(` ${message}`));

        container.prepend(entry);
        container.scrollTop = 0;
    }

    updateConnectionStatus(connected) {
        const status = document.getElementById('connectionStatus');
        if (connected) {
            status.className = 'connection-status connected';
            status.textContent = '🟢 Connected';
        } else {
            status.className = 'connection-status disconnected';
            status.textContent = '🔴 Disconnected';
        }
    }

    disableForm() {
        document.getElementById('generateBtn').disabled = true;
        document.getElementById('generateBtn').textContent = '🔄 Generating...';
        document.getElementById('siteUrl').disabled = true;
        document.getElementById('maxDepth').disabled = true;
    }

    enableForm() {
        document.getElementById('generateBtn').disabled = false;
        document.getElementById('generateBtn').textContent = '🚀 Generate Sitemap';
        document.getElementById('siteUrl').disabled = false;
        document.getElementById('maxDepth').disabled = false;
    }
}
|
||||
|
||||
// Initialize the application.
// Global instance; the inline onclick handlers in the static HTML
// (generateBtn, downloadBtn, clear button) reference it by this name.
const sitemapGen = new SitemapGenerator();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
72
main.go
Normal file
72
main.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package main
|
||||
|
||||
import (
	"log"
	"net/http"
	"os"
	"time"

	"sitemap-api/database"
	"sitemap-api/handlers"

	"github.com/go-chi/chi/v5"
	"github.com/go-chi/chi/v5/middleware"
	"github.com/go-chi/cors"
)
|
||||
|
||||
func main() {
|
||||
// Initialize database
|
||||
db, err := database.NewDB("sitemap.db")
|
||||
if err != nil {
|
||||
log.Fatal("Failed to initialize database:", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Initialize stream manager
|
||||
streamManager := handlers.NewStreamManager()
|
||||
|
||||
// Initialize handler
|
||||
h := handlers.NewHandler(db, streamManager)
|
||||
|
||||
// Setup router
|
||||
r := chi.NewRouter()
|
||||
|
||||
// Middleware
|
||||
r.Use(middleware.Logger)
|
||||
r.Use(middleware.Recoverer)
|
||||
r.Use(middleware.RealIP)
|
||||
r.Use(cors.Handler(cors.Options{
|
||||
AllowedOrigins: []string{"https://*", "http://*"},
|
||||
AllowedMethods: []string{"GET", "POST", "PUT", "DELETE", "OPTIONS"},
|
||||
AllowedHeaders: []string{"Accept", "Authorization", "Content-Type"},
|
||||
ExposedHeaders: []string{"Link"},
|
||||
AllowCredentials: true,
|
||||
MaxAge: 300,
|
||||
}))
|
||||
|
||||
// Serve static HTML at root
|
||||
r.Get("/", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.ServeFile(w, r, "static/index.html")
|
||||
})
|
||||
|
||||
// API Routes
|
||||
r.Post("/generate-sitemap-xml", h.GenerateSitemapXML)
|
||||
r.Get("/stream/{uuid}", h.StreamSSE)
|
||||
r.Get("/download/{uuid}", h.DownloadSitemap)
|
||||
r.Get("/sites", h.GetSites)
|
||||
r.Get("/sites/{id}", h.GetSite)
|
||||
r.Delete("/sites/{id}", h.DeleteSite)
|
||||
r.Get("/health", h.Health)
|
||||
|
||||
// Get port from environment or use default
|
||||
port := os.Getenv("PORT")
|
||||
if port == "" {
|
||||
port = "8080"
|
||||
}
|
||||
|
||||
log.Printf("Server starting on port %s...", port)
|
||||
log.Printf("Visit http://localhost:%s to use the sitemap generator", port)
|
||||
|
||||
if err := http.ListenAndServe(":"+port, r); err != nil {
|
||||
log.Fatal("Server failed to start:", err)
|
||||
}
|
||||
}
|
||||
44
run.sh
Normal file
44
run.sh
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/bin/bash
# Build-and-run helper for the XML Sitemap Generator API: verifies the
# Go toolchain, fetches dependencies, builds the binary, and starts the
# server in the foreground.

# Abort on any unchecked error, on use of unset variables, and on
# failures anywhere in a pipeline.
set -euo pipefail

echo "🗺️ XML Sitemap Generator API"
echo "=============================="
echo ""

# Check if Go is installed
if ! command -v go &> /dev/null; then
    echo "❌ Error: Go is not installed"
    echo "Please install Go 1.21+ from https://golang.org/dl/"
    exit 1
fi

echo "✅ Go version: $(go version)"
echo ""

# Install dependencies (tested directly instead of inspecting $?, which
# is fragile and defeated by `set -e`).
echo "📦 Installing dependencies..."
if ! go mod download; then
    echo "❌ Failed to download dependencies"
    exit 1
fi
echo "✅ Dependencies installed"
echo ""

# Build the application
echo "🔨 Building application..."
if ! go build -o sitemap-api .; then
    echo "❌ Build failed"
    exit 1
fi
echo "✅ Build successful"
echo ""

# Run the application
echo "🚀 Starting server..."
echo ""
echo "Server will start on http://localhost:8080"
echo "Press Ctrl+C to stop"
echo ""

./sitemap-api
|
||||
59
site.go
Normal file
59
site.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package models
|
||||
|
||||
import "time"
|
||||
|
||||
// Site is one sitemap-generation job: the target URL and crawl
// settings, request metadata captured from the submitting client
// (IP, user agent, browser/OS breakdown, session, cookies, referrer),
// and lifecycle timestamps.
type Site struct {
	ID             int        `json:"id"`
	UUID           string     `json:"uuid"` // public identifier used in /stream/{uuid} and /download/{uuid}
	Domain         string     `json:"domain"`
	URL            string     `json:"url"`
	MaxDepth       int        `json:"max_depth"` // crawl depth limit requested by the client
	PageCount      int        `json:"page_count"`
	Status         string     `json:"status"` // processing, completed, failed
	IPAddress      string     `json:"ip_address"`
	UserAgent      string     `json:"user_agent"`
	Browser        string     `json:"browser"`
	BrowserVersion string     `json:"browser_version"`
	OS             string     `json:"os"`
	DeviceType     string     `json:"device_type"`
	SessionID      string     `json:"session_id"`
	Cookies        string     `json:"cookies"`
	Referrer       string     `json:"referrer"`
	CreatedAt      time.Time  `json:"created_at"`
	CompletedAt    *time.Time `json:"completed_at,omitempty"` // nil until the crawl finishes
	LastCrawled    *time.Time `json:"last_crawled,omitempty"`
}
|
||||
|
||||
// Page is a single crawled URL belonging to a Site. Priority and
// ChangeFreq correspond to the <priority> and <changefreq> elements of
// an XML sitemap entry.
type Page struct {
	ID           int       `json:"id"`
	SiteID       int       `json:"site_id"` // owning Site.ID
	URL          string    `json:"url"`
	Depth        int       `json:"depth"` // link depth at which the crawler found this page
	LastModified time.Time `json:"last_modified"`
	Priority     float64   `json:"priority"`
	ChangeFreq   string    `json:"change_freq"`
}
|
||||
|
||||
// Event is a generic event envelope: Type names the event kind and
// Data carries the type-specific payload (e.g. ProgressData,
// CompleteData, ErrorData).
type Event struct {
	Type string `json:"type"`
	Data any    `json:"data"` // any is the modern spelling of interface{} (Go 1.18+)
}
|
||||
|
||||
// ProgressData is the payload of a crawl progress update.
type ProgressData struct {
	UUID       string `json:"uuid"`        // identifies which crawl this update belongs to
	PagesFound int    `json:"pages_found"` // running total of discovered pages
	Depth      int    `json:"depth"`       // depth currently being crawled
	CurrentURL string `json:"current_url"` // URL being processed right now
}
|
||||
|
||||
// CompleteData is the payload sent when a crawl finishes successfully.
type CompleteData struct {
	UUID        string `json:"uuid"`
	PagesFound  int    `json:"pages_found"`  // final page count for the crawl
	SiteID      int    `json:"site_id"`      // database id of the completed Site
	DownloadURL string `json:"download_url"` // where the generated sitemap can be fetched
}
|
||||
|
||||
// ErrorData is the payload sent when a crawl fails.
type ErrorData struct {
	UUID  string `json:"uuid"`
	Error string `json:"error"` // human-readable failure description
}
|
||||
Reference in New Issue
Block a user