spiderJump

This commit is contained in:
Kar
2026-02-20 20:42:59 +05:30
parent 309ba5fbae
commit babb1e7da1
2 changed files with 788 additions and 263 deletions

616
main.go
View File

@@ -23,33 +23,49 @@ import (
) )
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// Global state // Constants & globals
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
const mainDBFile = "siliconpin_spider.sqlite" const mainDBFile = "siliconpin_spider.sqlite"
var mainDB *sql.DB var mainDB *sql.DB
// per-domain SSE brokers // SSE brokers one per domain
var ( var (
brokersMu sync.RWMutex brokersMu sync.RWMutex
brokers = map[string]*Broker{} brokers = map[string]*Broker{}
) )
// per-domain DB connections (kept open) // open domain DB handles
var ( var (
domainDBsMu sync.RWMutex domainDBsMu sync.RWMutex
domainDBs = map[string]*sql.DB{} domainDBs = map[string]*sql.DB{}
) )
// guard against duplicate crawlers // crawler goroutine guard
var ( var (
crawlersMu sync.Mutex crawlersMu sync.Mutex
crawlers = map[string]bool{} crawlers = map[string]bool{}
) )
// pause/resume channels — one per domain.
// A send on pauseChs[d] asks d's crawler to pause at its next checkpoint
// (see checkPause); a send on resumeChs[d] wakes a paused crawler.
// Both channels are created lazily by ensurePauseChannels and are
// buffered (size 1) so signalling never blocks the HTTP handlers.
var (
	pauseChsMu sync.RWMutex
	pauseChs   = map[string]chan struct{}{} // pause signal
	resumeChs  = map[string]chan struct{}{} // resume signal
)
// Domain lifecycle states persisted in the main DB's domains.status column.
const (
	statusRunning = "running" // crawler actively fetching (set at crawl start)
	statusPaused  = "paused"  // crawler blocked in checkPause awaiting resume
	statusDone    = "done"    // queue exhausted, crawl finished
	statusPending = "pending" // registered, crawl not yet started (default)
)
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// SSE Broker fan-out to multiple subscribers per domain // SSE Broker
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
type Broker struct { type Broker struct {
@@ -57,9 +73,7 @@ type Broker struct {
clients map[chan string]struct{} clients map[chan string]struct{}
} }
func newBroker() *Broker { func newBroker() *Broker { return &Broker{clients: make(map[chan string]struct{})} }
return &Broker{clients: make(map[chan string]struct{})}
}
func (b *Broker) subscribe() chan string { func (b *Broker) subscribe() chan string {
ch := make(chan string, 64) ch := make(chan string, 64)
@@ -81,7 +95,7 @@ func (b *Broker) publish(msg string) {
for ch := range b.clients { for ch := range b.clients {
select { select {
case ch <- msg: case ch <- msg:
default: // slow client drop message default:
} }
} }
} }
@@ -103,22 +117,29 @@ func getBroker(domain string) *Broker {
return br return br
} }
// ───────────────────────────────────────────────────────────────── type ssePayload struct {
// SSE event helper
// ─────────────────────────────────────────────────────────────────
type sseEvent struct {
Event string `json:"event"` Event string `json:"event"`
Data interface{} `json:"data"` Data interface{} `json:"data"`
} }
func emit(br *Broker, event string, data interface{}) { func emit(br *Broker, event string, data interface{}) {
payload, _ := json.Marshal(sseEvent{Event: event, Data: data}) b, _ := json.Marshal(ssePayload{Event: event, Data: data})
br.publish(string(payload)) br.publish(string(b))
}
// broadcast emits to ALL domain brokers (e.g. for a new_domain event)
func broadcast(event string, data interface{}) {
brokersMu.RLock()
defer brokersMu.RUnlock()
b, _ := json.Marshal(ssePayload{Event: event, Data: data})
msg := string(b)
for _, br := range brokers {
br.publish(msg)
}
} }
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// Database helpers // Main DB helpers
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
func initMainDB() { func initMainDB() {
@@ -132,6 +153,8 @@ func initMainDB() {
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE, domain TEXT NOT NULL UNIQUE,
interval INTEGER NOT NULL DEFAULT 60, interval INTEGER NOT NULL DEFAULT 60,
status TEXT NOT NULL DEFAULT 'pending',
parent TEXT NOT NULL DEFAULT '',
created_at DATETIME NOT NULL, created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL updated_at DATETIME NOT NULL
)`) )`)
@@ -141,6 +164,67 @@ func initMainDB() {
log.Printf("Main DB ready: %s", mainDBFile) log.Printf("Main DB ready: %s", mainDBFile)
} }
// setDomainStatus records the crawler lifecycle state for a domain in the
// main DB and bumps updated_at. Failures are logged rather than returned
// so status bookkeeping never interrupts the crawl loop.
func setDomainStatus(domain, status string) {
	now := time.Now().UTC().Format(time.RFC3339)
	if _, err := mainDB.Exec(`UPDATE domains SET status=?, updated_at=? WHERE domain=?`, status, now, domain); err != nil {
		// Previously this error was silently discarded.
		log.Printf("setDomainStatus %s -> %s: %v", domain, status, err)
	}
}
// DomainRow is the JSON shape returned by GET /api/domains: one registered
// domain from the main DB plus live counters read from its per-domain DB.
type DomainRow struct {
	ID        int    `json:"id"`
	Domain    string `json:"domain"`
	Interval  int    `json:"interval"`         // base crawl delay in seconds
	Status    string `json:"status"`           // one of the status* constants
	Parent    string `json:"parent,omitempty"` // domain that discovered this one; "" if user-added
	URLCount  int    `json:"url_count"`        // rows in the domain's urls table
	QueueLen  int    `json:"queue_len"`        // rows in the domain's queue table
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
}
// listDomains returns every registered domain from the main DB, enriched
// with live URL/queue counts pulled from each domain's own database.
// Counts are best-effort: a domain whose DB cannot be opened is still
// listed, just with zero counts.
func listDomains() ([]DomainRow, error) {
	rows, err := mainDB.Query(
		`SELECT id, domain, interval, status, parent, created_at, updated_at
		 FROM domains ORDER BY id ASC`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var out []DomainRow
	for rows.Next() {
		var d DomainRow
		if err := rows.Scan(&d.ID, &d.Domain, &d.Interval, &d.Status,
			&d.Parent, &d.CreatedAt, &d.UpdatedAt); err != nil {
			// Skip a malformed row but leave a trace instead of dropping it
			// silently (previously the error vanished entirely).
			log.Printf("listDomains: scan: %v", err)
			continue
		}
		// get live counts from domain DB (best-effort; errors ignored)
		if db, dbErr := openDomainDB(d.Domain); dbErr == nil {
			db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&d.URLCount)
			db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&d.QueueLen)
		}
		out = append(out, d)
	}
	// Surface iteration errors (previously unchecked): a connection failure
	// mid-scan would otherwise return a silently truncated list.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return out, nil
}
// registerDomain inserts or refreshes a domain row in the main DB.
// A user-added domain passes parentDomain == ""; an auto-discovered domain
// passes the domain that linked to it. On conflict only interval and
// updated_at are refreshed — the existing status and parent are kept.
func registerDomain(domain string, interval int, parentDomain string) error {
	ts := time.Now().UTC().Format(time.RFC3339)
	if _, err := mainDB.Exec(`
	INSERT INTO domains (domain, interval, status, parent, created_at, updated_at)
	VALUES (?, ?, ?, ?, ?, ?)
	ON CONFLICT(domain) DO UPDATE SET
	  interval=excluded.interval,
	  updated_at=excluded.updated_at`,
		domain, interval, statusPending, parentDomain, ts, ts); err != nil {
		return err
	}
	return nil
}
// ─────────────────────────────────────────────────────────────────
// Domain DB helpers
// ─────────────────────────────────────────────────────────────────
func openDomainDB(domain string) (*sql.DB, error) { func openDomainDB(domain string) (*sql.DB, error) {
domainDBsMu.RLock() domainDBsMu.RLock()
db, ok := domainDBs[domain] db, ok := domainDBs[domain]
@@ -153,24 +237,32 @@ func openDomainDB(domain string) (*sql.DB, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
_, err = db.Exec(` if _, err = db.Exec(`
CREATE TABLE IF NOT EXISTS urls ( CREATE TABLE IF NOT EXISTS urls (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE, url TEXT NOT NULL UNIQUE,
created_at DATETIME NOT NULL, created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL updated_at DATETIME NOT NULL
)`) )`); err != nil {
if err != nil {
db.Close() db.Close()
return nil, err return nil, err
} }
_, err = db.Exec(` if _, err = db.Exec(`
CREATE TABLE IF NOT EXISTS queue ( CREATE TABLE IF NOT EXISTS queue (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE, url TEXT NOT NULL UNIQUE,
added_at DATETIME NOT NULL added_at DATETIME NOT NULL
)`) )`); err != nil {
if err != nil { db.Close()
return nil, err
}
// cross-domain links discovered during crawl
if _, err = db.Exec(`
CREATE TABLE IF NOT EXISTS ext_links (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ext_domain TEXT NOT NULL UNIQUE,
found_at DATETIME NOT NULL
)`); err != nil {
db.Close() db.Close()
return nil, err return nil, err
} }
@@ -185,8 +277,7 @@ func insertURL(db *sql.DB, rawURL string) (bool, error) {
now := time.Now().UTC().Format(time.RFC3339) now := time.Now().UTC().Format(time.RFC3339)
res, err := db.Exec( res, err := db.Exec(
`INSERT OR IGNORE INTO urls (url, created_at, updated_at) VALUES (?, ?, ?)`, `INSERT OR IGNORE INTO urls (url, created_at, updated_at) VALUES (?, ?, ?)`,
rawURL, now, now, rawURL, now, now)
)
if err != nil { if err != nil {
return false, err return false, err
} }
@@ -200,29 +291,22 @@ func isURLKnown(db *sql.DB, rawURL string) bool {
return c > 0 return c > 0
} }
// ── persistent queue helpers ──────────────────────────────────────
// enqueueURL adds a URL to the persistent queue if not already there
// and not already crawled.
func enqueueURL(db *sql.DB, rawURL string) { func enqueueURL(db *sql.DB, rawURL string) {
now := time.Now().UTC().Format(time.RFC3339) now := time.Now().UTC().Format(time.RFC3339)
db.Exec(`INSERT OR IGNORE INTO queue (url, added_at) VALUES (?, ?)`, rawURL, now) db.Exec(`INSERT OR IGNORE INTO queue (url, added_at) VALUES (?, ?)`, rawURL, now)
} }
// dequeueURL removes and returns the oldest queued URL (FIFO).
// Returns "", false when the queue is empty.
func dequeueURL(db *sql.DB) (string, bool) { func dequeueURL(db *sql.DB) (string, bool) {
tx, err := db.Begin() tx, err := db.Begin()
if err != nil { if err != nil {
return "", false return "", false
} }
defer tx.Rollback() //nolint:errcheck defer tx.Rollback() //nolint:errcheck
var id int64 var id int64
var rawURL string var rawURL string
err = tx.QueryRow(`SELECT id, url FROM queue ORDER BY id ASC LIMIT 1`).Scan(&id, &rawURL) if err = tx.QueryRow(`SELECT id, url FROM queue ORDER BY id ASC LIMIT 1`).
if err != nil { Scan(&id, &rawURL); err != nil {
return "", false // empty return "", false
} }
if _, err = tx.Exec(`DELETE FROM queue WHERE id = ?`, id); err != nil { if _, err = tx.Exec(`DELETE FROM queue WHERE id = ?`, id); err != nil {
return "", false return "", false
@@ -233,31 +317,138 @@ func dequeueURL(db *sql.DB) (string, bool) {
return rawURL, true return rawURL, true
} }
// queueLen returns the current number of pending URLs.
func queueLen(db *sql.DB) int { func queueLen(db *sql.DB) int {
var n int var n int
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&n) db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&n)
return n return n
} }
// seedQueue inserts the start URL only when the queue is completely empty
// (first ever run). On restart the persisted queue is used as-is.
func seedQueue(db *sql.DB, startURL string) { func seedQueue(db *sql.DB, startURL string) {
var qCount, uCount int var qc, uc int
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&qCount) db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&qc)
db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&uCount) db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&uc)
if qCount == 0 && uCount == 0 { if qc == 0 && uc == 0 {
enqueueURL(db, startURL) enqueueURL(db, startURL)
} }
} }
// recordExtLink saves a discovered external domain into the source domain's
// ext_links table and, on first sighting only, auto-registers it in the
// main DB (inheriting the parent's interval), notifies the UI, and starts
// a crawler goroutine for it.
func recordExtLink(srcDomain, extDomain string, parentInterval int) {
	db, err := openDomainDB(srcDomain)
	if err != nil {
		return
	}
	now := time.Now().UTC().Format(time.RFC3339)
	res, err := db.Exec(
		`INSERT OR IGNORE INTO ext_links (ext_domain, found_at) VALUES (?, ?)`,
		extDomain, now)
	if err != nil {
		// BUG FIX: the error was previously discarded (`res, _ :=`), so a
		// failed INSERT left res nil and res.RowsAffected() panicked.
		log.Printf("recordExtLink %s -> %s: %v", srcDomain, extDomain, err)
		return
	}
	n, _ := res.RowsAffected()
	if n == 0 {
		return // already recorded
	}
	// Register in main DB (inherit parent's interval)
	if err := registerDomain(extDomain, parentInterval, srcDomain); err != nil {
		log.Printf("registerDomain %s (from %s): %v", extDomain, srcDomain, err)
		return
	}
	log.Printf("[%s] discovered external domain: %s", srcDomain, extDomain)
	// Notify UI
	broadcast("new_domain", map[string]string{
		"domain": extDomain,
		"parent": srcDomain,
	})
	// Init the new domain's DB and start its crawler (guarded so at most
	// one crawler goroutine runs per domain)
	if _, err := openDomainDB(extDomain); err != nil {
		log.Printf("openDomainDB %s: %v", extDomain, err)
		return
	}
	crawlersMu.Lock()
	if !crawlers[extDomain] {
		crawlers[extDomain] = true
		go crawlDomain(extDomain, parentInterval)
	}
	crawlersMu.Unlock()
}
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// robots.txt (minimal, single-pass parser) // Pause / resume machinery
// ─────────────────────────────────────────────────────────────────
// ensurePauseChannels lazily creates the pause/resume signal channels for
// a domain. Both are buffered (size 1) so senders never block. Idempotent:
// existing channels are left untouched.
func ensurePauseChannels(domain string) {
	pauseChsMu.Lock()
	defer pauseChsMu.Unlock()
	if _, exists := pauseChs[domain]; exists {
		return
	}
	pauseChs[domain] = make(chan struct{}, 1)
	resumeChs[domain] = make(chan struct{}, 1)
}
// pauseCrawler asks the crawler for domain to pause. The send is
// non-blocking: if no channel exists for the domain or a pause signal is
// already pending, this is a no-op.
func pauseCrawler(domain string) {
	pauseChsMu.RLock()
	ch := pauseChs[domain]
	pauseChsMu.RUnlock()
	if ch == nil {
		return
	}
	select {
	case ch <- struct{}{}:
	default: // signal already pending — drop
	}
}
// resumeCrawler wakes a paused crawler for domain. The send is
// non-blocking: if no channel exists or a resume signal is already
// pending, this is a no-op.
func resumeCrawler(domain string) {
	pauseChsMu.RLock()
	ch := resumeChs[domain]
	pauseChsMu.RUnlock()
	if ch == nil {
		return
	}
	select {
	case ch <- struct{}{}:
	default: // signal already pending — drop
	}
}
// checkPause is called inside the crawl loop between requests.
// If a pause signal is pending it marks the domain paused, notifies SSE
// subscribers, and blocks until a resume signal arrives; otherwise it
// returns immediately. If ensurePauseChannels was never called for this
// domain, pCh/rCh are nil and the select always falls through to default
// (a nil channel never becomes ready).
func checkPause(domain string, br *Broker) {
	pauseChsMu.RLock()
	pCh := pauseChs[domain]
	rCh := resumeChs[domain]
	pauseChsMu.RUnlock()
	select {
	case <-pCh:
		// Pause requested: persist the state and tell SSE subscribers.
		setDomainStatus(domain, statusPaused)
		emit(br, "paused", map[string]string{"domain": domain})
		log.Printf("[%s] paused", domain)
		// drain any duplicate pause signals so a stale pause does not
		// immediately re-trigger after the next resume
		for len(pCh) > 0 {
			<-pCh
		}
		// block until resume — NOTE(review): no timeout or shutdown path;
		// a paused crawler waits here indefinitely
		<-rCh
		setDomainStatus(domain, statusRunning)
		emit(br, "resumed", map[string]string{"domain": domain})
		log.Printf("[%s] resumed", domain)
	default: // no pause pending — keep crawling
	}
}
// ─────────────────────────────────────────────────────────────────
// robots.txt
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
type robotsRules struct { type robotsRules struct {
disallowed []string disallowed []string
crawlDelay int // 0 = not set crawlDelay int
} }
func fetchRobots(domain string) *robotsRules { func fetchRobots(domain string) *robotsRules {
@@ -277,11 +468,9 @@ func fetchRobots(domain string) *robotsRules {
continue continue
} }
lower := strings.ToLower(line) lower := strings.ToLower(line)
if strings.HasPrefix(lower, "user-agent:") { if strings.HasPrefix(lower, "user-agent:") {
agent := strings.TrimSpace(line[len("user-agent:"):]) agent := strings.TrimSpace(line[len("user-agent:"):])
inSection = agent == "*" || inSection = agent == "*" || strings.EqualFold(agent, "siliconpin_spider")
strings.EqualFold(agent, "siliconpin_spider")
continue continue
} }
if !inSection { if !inSection {
@@ -310,14 +499,21 @@ func (r *robotsRules) allowed(path string) bool {
} }
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// Link extractor same-host HTML links only // Link extractor
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
var hrefRe = regexp.MustCompile(`(?i)href=["']([^"'#][^"']*)["']`) var hrefRe = regexp.MustCompile(`(?i)href=["']([^"'#][^"']*)["']`)
func extractLinks(base *url.URL, body string) []string { type extractedLinks struct {
seen := map[string]bool{} sameHost []string
var links []string external []string // distinct external hostnames (not full URLs)
}
func extractLinks(base *url.URL, body string) extractedLinks {
seenSame := map[string]bool{}
seenExt := map[string]bool{}
var result extractedLinks
for _, m := range hrefRe.FindAllStringSubmatch(body, -1) { for _, m := range hrefRe.FindAllStringSubmatch(body, -1) {
href := strings.TrimSpace(m[1]) href := strings.TrimSpace(m[1])
parsed, err := url.Parse(href) parsed, err := url.Parse(href)
@@ -330,16 +526,25 @@ func extractLinks(base *url.URL, body string) []string {
if resolved.Scheme != "http" && resolved.Scheme != "https" { if resolved.Scheme != "http" && resolved.Scheme != "https" {
continue continue
} }
if !strings.EqualFold(resolved.Hostname(), base.Hostname()) { host := strings.ToLower(resolved.Hostname())
continue baseHost := strings.ToLower(base.Hostname())
}
s := resolved.String() if host == baseHost {
if !seen[s] { s := resolved.String()
seen[s] = true if !seenSame[s] {
links = append(links, s) seenSame[s] = true
result.sameHost = append(result.sameHost, s)
}
} else {
// strip www. for normalisation
extDomain := strings.TrimPrefix(host, "www.")
if extDomain != "" && !seenExt[extDomain] && isValidDomain(extDomain) {
seenExt[extDomain] = true
result.external = append(result.external, extDomain)
}
} }
} }
return links return result
} }
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
@@ -347,20 +552,21 @@ func extractLinks(base *url.URL, body string) []string {
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
func crawlDomain(domain string, intervalSec int) { func crawlDomain(domain string, intervalSec int) {
log.Printf("[%s] crawler started (base interval %ds)", domain, intervalSec) log.Printf("[%s] crawler started (interval %ds)", domain, intervalSec)
br := getBroker(domain) br := getBroker(domain)
ensurePauseChannels(domain)
db, err := openDomainDB(domain) db, err := openDomainDB(domain)
if err != nil { if err != nil {
emit(br, "error", map[string]string{"msg": "DB error: " + err.Error()}) emit(br, "error", map[string]string{"msg": "DB open failed: " + err.Error()})
return return
} }
// ── robots.txt ────────────────────────────────────────────── setDomainStatus(domain, statusRunning)
// robots.txt
emit(br, "status", map[string]string{"msg": "fetching robots.txt"}) emit(br, "status", map[string]string{"msg": "fetching robots.txt"})
robots := fetchRobots(domain) robots := fetchRobots(domain)
// robots.txt crawl-delay overrides our setting if higher
if robots.crawlDelay > intervalSec { if robots.crawlDelay > intervalSec {
intervalSec = robots.crawlDelay intervalSec = robots.crawlDelay
now := time.Now().UTC().Format(time.RFC3339) now := time.Now().UTC().Format(time.RFC3339)
@@ -373,10 +579,6 @@ func crawlDomain(domain string, intervalSec int) {
"effective_delay": intervalSec, "effective_delay": intervalSec,
}) })
// ── Persistent BFS queue ────────────────────────────────────
// On first run: seed with the start URL.
// On restart: the queue table already holds the pending URLs —
// we just continue from where we left off.
startURL := "https://" + domain + "/" startURL := "https://" + domain + "/"
seedQueue(db, startURL) seedQueue(db, startURL)
@@ -391,24 +593,25 @@ func crawlDomain(domain string, intervalSec int) {
} }
for { for {
// Re-read interval in case it was updated via API // ── pause check ─────────────────────────────────────────
checkPause(domain, br)
// ── re-read interval ────────────────────────────────────
var cur int var cur int
if err := mainDB.QueryRow(`SELECT interval FROM domains WHERE domain=?`, domain).Scan(&cur); err == nil && cur > 0 { if mainDB.QueryRow(`SELECT interval FROM domains WHERE domain=?`, domain).
Scan(&cur) == nil && cur > 0 {
intervalSec = cur intervalSec = cur
} }
target, ok := dequeueURL(db) target, ok := dequeueURL(db)
if !ok { if !ok {
break // queue exhausted break
} }
// Skip if already crawled (can happen if same URL was enqueued
// multiple times before being dequeued, or after a re-seed)
if isURLKnown(db, target) { if isURLKnown(db, target) {
continue continue
} }
// robots check
parsed, err := url.Parse(target) parsed, err := url.Parse(target)
if err != nil { if err != nil {
continue continue
@@ -418,24 +621,24 @@ func crawlDomain(domain string, intervalSec int) {
continue continue
} }
// random delay: [interval, interval*2] seconds // random delay [interval, interval*2]
delaySec := intervalSec + rand.Intn(intervalSec+1) delaySec := intervalSec + rand.Intn(intervalSec+1)
delay := time.Duration(delaySec) * time.Second
emit(br, "waiting", map[string]interface{}{ emit(br, "waiting", map[string]interface{}{
"url": target, "url": target,
"delay_s": delaySec, "delay_s": delaySec,
"queue": queueLen(db), "queue": queueLen(db),
}) })
time.Sleep(delay) time.Sleep(time.Duration(delaySec) * time.Second)
// ── pause check after sleep (could have been paused during wait) ──
checkPause(domain, br)
// fetch
emit(br, "fetching", map[string]string{"url": target}) emit(br, "fetching", map[string]string{"url": target})
resp, err := httpClient.Get(target) resp, err := httpClient.Get(target)
if err != nil { if err != nil {
emit(br, "error", map[string]string{"url": target, "msg": err.Error()}) emit(br, "error", map[string]string{"url": target, "msg": err.Error()})
log.Printf("[%s] fetch error %s: %v", domain, target, err) log.Printf("[%s] fetch error %s: %v", domain, target, err)
// Re-enqueue so it's retried next run enqueueURL(db, target) // retry next run
enqueueURL(db, target)
continue continue
} }
@@ -444,13 +647,12 @@ func crawlDomain(domain string, intervalSec int) {
var bodyStr string var bodyStr string
if isHTML { if isHTML {
raw, _ := io.ReadAll(io.LimitReader(resp.Body, 5<<20)) // 5 MB cap raw, _ := io.ReadAll(io.LimitReader(resp.Body, 5<<20))
bodyStr = string(raw) bodyStr = string(raw)
} }
resp.Body.Close() resp.Body.Close()
inserted, _ := insertURL(db, target) if ins, _ := insertURL(db, target); ins {
if inserted {
emit(br, "saved", map[string]interface{}{ emit(br, "saved", map[string]interface{}{
"url": target, "url": target,
"status": resp.StatusCode, "status": resp.StatusCode,
@@ -459,11 +661,12 @@ func crawlDomain(domain string, intervalSec int) {
log.Printf("[%s] saved: %s", domain, target) log.Printf("[%s] saved: %s", domain, target)
} }
// discover links from HTML pages
if isHTML && resp.StatusCode == 200 { if isHTML && resp.StatusCode == 200 {
links := extractLinks(parsed, bodyStr) links := extractLinks(parsed, bodyStr)
// same-host links → queue
newCount := 0 newCount := 0
for _, link := range links { for _, link := range links.sameHost {
if !isURLKnown(db, link) { if !isURLKnown(db, link) {
enqueueURL(db, link) enqueueURL(db, link)
newCount++ newCount++
@@ -471,13 +674,20 @@ func crawlDomain(domain string, intervalSec int) {
} }
emit(br, "links_found", map[string]interface{}{ emit(br, "links_found", map[string]interface{}{
"url": target, "url": target,
"found": len(links), "found": len(links.sameHost),
"new": newCount, "new": newCount,
"queue_len": queueLen(db), "queue_len": queueLen(db),
"external": len(links.external),
}) })
// external domains → auto-register & crawl
for _, extDomain := range links.external {
recordExtLink(domain, extDomain, intervalSec)
}
} }
} }
setDomainStatus(domain, statusDone)
emit(br, "done", map[string]string{"domain": domain, "msg": "crawl complete"}) emit(br, "done", map[string]string{"domain": domain, "msg": "crawl complete"})
log.Printf("[%s] crawl complete", domain) log.Printf("[%s] crawl complete", domain)
@@ -487,7 +697,7 @@ func crawlDomain(domain string, intervalSec int) {
} }
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// HTTP handlers // HTTP helpers
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
func sanitizeDomain(raw string) string { func sanitizeDomain(raw string) string {
@@ -503,37 +713,39 @@ var domainRe = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?
func isValidDomain(d string) bool { return domainRe.MatchString(d) } func isValidDomain(d string) bool { return domainRe.MatchString(d) }
func jsonOK(w http.ResponseWriter, code int, v interface{}) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
json.NewEncoder(w).Encode(v)
}
// ─────────────────────────────────────────────────────────────────
// Handlers
// ─────────────────────────────────────────────────────────────────
// POST /api/add_domain // POST /api/add_domain
func addDomainHandler(w http.ResponseWriter, r *http.Request) { func addDomainHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost { if r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed) http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
return return
} }
var body struct { var body struct {
Domain string `json:"domain"` Domain string `json:"domain"`
CrawlDelay string `json:"Crawl-delay"` CrawlDelay string `json:"Crawl-delay"`
} }
w.Header().Set("Content-Type", "application/json")
if err := json.NewDecoder(r.Body).Decode(&body); err != nil { if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
w.WriteHeader(http.StatusBadRequest) jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
json.NewEncoder(w).Encode(map[string]string{"error": "invalid JSON"})
return return
} }
if body.Domain == "" { if body.Domain == "" {
w.WriteHeader(http.StatusBadRequest) jsonOK(w, http.StatusBadRequest, map[string]string{"error": "domain is required"})
json.NewEncoder(w).Encode(map[string]string{"error": "domain is required"})
return return
} }
domain := sanitizeDomain(body.Domain) domain := sanitizeDomain(body.Domain)
if !isValidDomain(domain) { if !isValidDomain(domain) {
w.WriteHeader(http.StatusBadRequest) jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
json.NewEncoder(w).Encode(map[string]string{"error": "invalid domain"})
return return
} }
interval := 60 interval := 60
if body.CrawlDelay != "" { if body.CrawlDelay != "" {
fmt.Sscanf(body.CrawlDelay, "%d", &interval) fmt.Sscanf(body.CrawlDelay, "%d", &interval)
@@ -542,25 +754,15 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
} }
} }
now := time.Now().UTC().Format(time.RFC3339) if err := registerDomain(domain, interval, ""); err != nil {
_, err := mainDB.Exec( jsonOK(w, http.StatusInternalServerError, map[string]string{"error": "db error"})
`INSERT INTO domains (domain,interval,created_at,updated_at) VALUES (?,?,?,?)
ON CONFLICT(domain) DO UPDATE SET interval=excluded.interval, updated_at=excluded.updated_at`,
domain, interval, now, now,
)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
json.NewEncoder(w).Encode(map[string]string{"error": "db error"})
return return
} }
if _, err := openDomainDB(domain); err != nil { if _, err := openDomainDB(domain); err != nil {
w.WriteHeader(http.StatusInternalServerError) jsonOK(w, http.StatusInternalServerError, map[string]string{"error": "domain DB init failed"})
json.NewEncoder(w).Encode(map[string]string{"error": "domain DB init failed"})
return return
} }
// start crawler if not already running
crawlersMu.Lock() crawlersMu.Lock()
if !crawlers[domain] { if !crawlers[domain] {
crawlers[domain] = true crawlers[domain] = true
@@ -568,8 +770,9 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
} }
crawlersMu.Unlock() crawlersMu.Unlock()
w.WriteHeader(http.StatusCreated) broadcast("new_domain", map[string]string{"domain": domain, "parent": ""})
json.NewEncoder(w).Encode(map[string]interface{}{
jsonOK(w, http.StatusCreated, map[string]interface{}{
"message": "domain added, crawler started", "message": "domain added, crawler started",
"domain": domain, "domain": domain,
"interval": interval, "interval": interval,
@@ -578,35 +781,98 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
}) })
} }
// GET /api/sse/{domain} // GET /api/domains
func sseHandler(w http.ResponseWriter, r *http.Request) { func domainsHandler(w http.ResponseWriter, r *http.Request) {
rawDomain := strings.TrimPrefix(r.URL.Path, "/api/sse/") if r.Method != http.MethodGet {
domain := sanitizeDomain(rawDomain) http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
if !isValidDomain(domain) {
http.Error(w, "invalid domain", http.StatusBadRequest)
return return
} }
list, err := listDomains()
if err != nil {
jsonOK(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
return
}
if list == nil {
list = []DomainRow{}
}
jsonOK(w, http.StatusOK, list)
}
// POST /api/pause/{domain}
// pauseHandler validates the domain, verifies a crawler goroutine is
// active for it, then sends a non-blocking pause signal.
func pauseHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	target := strings.TrimPrefix(r.URL.Path, "/api/pause/")
	domain := sanitizeDomain(target)
	if !isValidDomain(domain) {
		jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
		return
	}
	crawlersMu.Lock()
	active := crawlers[domain]
	crawlersMu.Unlock()
	if !active {
		jsonOK(w, http.StatusConflict, map[string]string{"error": "crawler not running for this domain"})
		return
	}
	pauseCrawler(domain)
	jsonOK(w, http.StatusOK, map[string]string{"message": "pause signal sent", "domain": domain})
}
// POST /api/resume/{domain}
// resumeHandler validates the domain and sends a non-blocking resume
// signal to its (possibly paused) crawler.
func resumeHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}
	target := strings.TrimPrefix(r.URL.Path, "/api/resume/")
	domain := sanitizeDomain(target)
	if !isValidDomain(domain) {
		jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
		return
	}
	resumeCrawler(domain)
	jsonOK(w, http.StatusOK, map[string]string{"message": "resume signal sent", "domain": domain})
}
// GET /api/sse/{domain} — or /api/sse/ (global stream for all domains)
func sseHandler(w http.ResponseWriter, r *http.Request) {
rawDomain := strings.TrimPrefix(r.URL.Path, "/api/sse/")
rawDomain = strings.TrimRight(rawDomain, "/")
flusher, ok := w.(http.Flusher) flusher, ok := w.(http.Flusher)
if !ok { if !ok {
http.Error(w, "streaming not supported", http.StatusInternalServerError) http.Error(w, "streaming not supported", http.StatusInternalServerError)
return return
} }
w.Header().Set("Content-Type", "text/event-stream") w.Header().Set("Content-Type", "text/event-stream")
w.Header().Set("Cache-Control", "no-cache") w.Header().Set("Cache-Control", "no-cache")
w.Header().Set("Connection", "keep-alive") w.Header().Set("Connection", "keep-alive")
w.Header().Set("X-Accel-Buffering", "no") // nginx: disable proxy buffering w.Header().Set("X-Accel-Buffering", "no")
w.Header().Set("Access-Control-Allow-Origin", "*") w.Header().Set("Access-Control-Allow-Origin", "*")
br := getBroker(domain) // If no domain specified, subscribe to the global "__all__" broker
// which receives broadcasts (new_domain, shutdown, etc.)
domainKey := rawDomain
if domainKey == "" {
domainKey = "__all__"
} else {
domainKey = sanitizeDomain(domainKey)
if !isValidDomain(domainKey) && domainKey != "__all__" {
http.Error(w, "invalid domain", http.StatusBadRequest)
return
}
}
br := getBroker(domainKey)
ch := br.subscribe() ch := br.subscribe()
defer br.unsubscribe(ch) defer br.unsubscribe(ch)
log.Printf("[SSE] client connected → %s", domain) log.Printf("[SSE] client connected → %s", domainKey)
fmt.Fprintf(w, "data: {\"event\":\"connected\",\"data\":{\"domain\":%q}}\n\n", domainKey)
// send immediate connected event
fmt.Fprintf(w, "data: {\"event\":\"connected\",\"data\":{\"domain\":%q}}\n\n", domain)
flusher.Flush() flusher.Flush()
ticker := time.NewTicker(25 * time.Second) ticker := time.NewTicker(25 * time.Second)
@@ -615,7 +881,7 @@ func sseHandler(w http.ResponseWriter, r *http.Request) {
for { for {
select { select {
case <-r.Context().Done(): case <-r.Context().Done():
log.Printf("[SSE] client disconnected → %s", domain) log.Printf("[SSE] client disconnected → %s", domainKey)
return return
case msg := <-ch: case msg := <-ch:
fmt.Fprintf(w, "data: %s\n\n", msg) fmt.Fprintf(w, "data: %s\n\n", msg)
@@ -627,6 +893,27 @@ func sseHandler(w http.ResponseWriter, r *http.Request) {
} }
} }
// ─────────────────────────────────────────────────────────────────
// broadcast helper — publish to __all__ broker
// ─────────────────────────────────────────────────────────────────
// NOTE(review): init() with side effects is discouraged; this only
// pre-creates a broker, which getBroker would do lazily anyway.
func init() {
	// ensure the global broadcast broker always exists
	getBroker("__all__")
}
// override broadcast to also send to __all__
// NOTE(review): Go has no function overriding — this is a separate
// function that duplicates broadcast line-for-line (marshal once,
// publish to every registered broker, including "__all__" since it
// lives in the brokers map). No call site is visible in this view;
// if truly unused, remove it or fold callers onto broadcast.
func broadcastAll(event string, data interface{}) {
	b, _ := json.Marshal(ssePayload{Event: event, Data: data})
	msg := string(b)
	brokersMu.RLock()
	defer brokersMu.RUnlock()
	for _, br := range brokers {
		br.publish(msg)
	}
}
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
// main // main
// ───────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────
@@ -640,20 +927,28 @@ func main() {
initMainDB() initMainDB()
// Resume any domains already in the DB from a previous run // Resume domains from previous run
rows, err := mainDB.Query(`SELECT domain, interval FROM domains`) rows, err := mainDB.Query(`SELECT domain, interval, status FROM domains`)
if err == nil { if err == nil {
for rows.Next() { for rows.Next() {
var d string var d, status string
var iv int var iv int
if rows.Scan(&d, &iv) == nil { if rows.Scan(&d, &iv, &status) != nil {
crawlersMu.Lock() continue
if !crawlers[d] {
crawlers[d] = true
go crawlDomain(d, iv)
}
crawlersMu.Unlock()
} }
// don't restart completed or paused crawls automatically;
// only restart those that were mid-flight (running/pending)
if status == statusDone {
continue
}
crawlersMu.Lock()
if !crawlers[d] {
crawlers[d] = true
// reset status so it shows running
setDomainStatus(d, statusPending)
go crawlDomain(d, iv)
}
crawlersMu.Unlock()
} }
rows.Close() rows.Close()
} }
@@ -661,61 +956,46 @@ func main() {
mux := http.NewServeMux() mux := http.NewServeMux()
mux.Handle("/", http.FileServer(http.Dir("./static"))) mux.Handle("/", http.FileServer(http.Dir("./static")))
mux.HandleFunc("/api/add_domain", addDomainHandler) mux.HandleFunc("/api/add_domain", addDomainHandler)
mux.HandleFunc("/api/domains", domainsHandler)
mux.HandleFunc("/api/pause/", pauseHandler)
mux.HandleFunc("/api/resume/", resumeHandler)
mux.HandleFunc("/api/sse/", sseHandler) mux.HandleFunc("/api/sse/", sseHandler)
srv := &http.Server{ srv := &http.Server{Addr: ":8080", Handler: mux}
Addr: ":8080",
Handler: mux,
}
// ── Graceful shutdown ────────────────────────────────────────
quit := make(chan os.Signal, 1) quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
go func() { go func() {
log.Printf("siliconpin_spider listening on %s", srv.Addr) log.Printf("siliconpin_spider listening on :8080")
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Fatalf("server error: %v", err) log.Fatalf("server: %v", err)
} }
}() }()
sig := <-quit <-quit
log.Printf("received %s — shutting down gracefully…", sig) log.Println("shutting down…")
// 1. Stop accepting new HTTP requests; give in-flight ones 10s to finish
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel() defer cancel()
if err := srv.Shutdown(ctx); err != nil { srv.Shutdown(ctx) //nolint:errcheck
log.Printf("HTTP shutdown error: %v", err)
}
// 2. Notify all SSE clients
brokersMu.RLock() brokersMu.RLock()
for domain, br := range brokers { for d, br := range brokers {
emit(br, "shutdown", map[string]string{"domain": domain, "msg": "server stopping"}) emit(br, "shutdown", map[string]string{"domain": d, "msg": "server stopping"})
} }
brokersMu.RUnlock() brokersMu.RUnlock()
// Brief pause so SSE messages flush to clients
time.Sleep(500 * time.Millisecond) time.Sleep(500 * time.Millisecond)
// 3. Checkpoint WAL → merge pending writes into the .sqlite file
// After this the .sqlite is fully self-contained (no WAL needed).
domainDBsMu.RLock() domainDBsMu.RLock()
for domain, db := range domainDBs { for d, db := range domainDBs {
if _, err := db.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`); err != nil { db.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`) //nolint:errcheck
log.Printf("checkpoint %s: %v", domain, err)
} else {
log.Printf("checkpointed %s.sqlite", domain)
}
db.Close() db.Close()
log.Printf("checkpointed %s.sqlite", d)
} }
domainDBsMu.RUnlock() domainDBsMu.RUnlock()
if _, err := mainDB.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`); err != nil { mainDB.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`) //nolint:errcheck
log.Printf("checkpoint main DB: %v", err)
}
mainDB.Close() mainDB.Close()
log.Println("goodbye.")
log.Println("shutdown complete — all WAL data flushed, goodbye.")
} }

View File

@@ -5,134 +5,379 @@
<meta name="viewport" content="width=device-width,initial-scale=1"/> <meta name="viewport" content="width=device-width,initial-scale=1"/>
<title>SiliconPin Spider</title> <title>SiliconPin Spider</title>
<style> <style>
*{box-sizing:border-box;margin:0;padding:0} *{box-sizing:border-box;margin:0;padding:0}
body{font-family:'Segoe UI',sans-serif;background:#0f1117;color:#e0e0e0;min-height:100vh;padding:32px 20px} body{font-family:'Segoe UI',system-ui,sans-serif;background:#0d1117;color:#c9d1d9;min-height:100vh}
h1{color:#58a6ff;font-size:1.8rem;margin-bottom:4px} a{color:#58a6ff;text-decoration:none}
.sub{color:#8b949e;font-size:.9rem;margin-bottom:32px}
.card{background:#161b22;border:1px solid #30363d;border-radius:10px;padding:24px;max-width:680px;margin-bottom:24px} /* ── layout ─────────────────────────── */
h2{font-size:1rem;color:#cdd9e5;margin-bottom:16px} .layout{display:grid;grid-template-columns:320px 1fr;height:100vh;overflow:hidden}
label{display:block;font-size:.82rem;color:#8b949e;margin-bottom:4px} .sidebar{background:#161b22;border-right:1px solid #30363d;display:flex;flex-direction:column;overflow:hidden}
input{width:100%;padding:8px 12px;background:#0d1117;border:1px solid #30363d;border-radius:6px;color:#e0e0e0;font-size:.92rem;outline:none;transition:border .2s} .main{display:flex;flex-direction:column;overflow:hidden}
input:focus{border-color:#58a6ff}
.row{display:flex;gap:12px;margin-bottom:14px} /* ── sidebar header ─────────────────── */
.row>div{flex:1} .sidebar-header{padding:16px;border-bottom:1px solid #30363d;flex-shrink:0}
button{padding:9px 22px;background:#238636;border:none;border-radius:6px;color:#fff;font-size:.9rem;cursor:pointer;transition:background .2s} .sidebar-header h1{font-size:1.1rem;color:#58a6ff;display:flex;align-items:center;gap:6px}
button:hover{background:#2ea043} .sidebar-header p{font-size:.75rem;color:#8b949e;margin-top:3px}
#log{background:#0d1117;border:1px solid #30363d;border-radius:8px;padding:16px;height:340px;overflow-y:auto;font-size:.8rem;font-family:monospace;margin-top:12px}
.ev{padding:3px 0;border-bottom:1px solid #1c2128;display:flex;gap:8px;align-items:flex-start} /* ── add domain form ────────────────── */
.ev:last-child{border-bottom:none} .add-form{padding:12px 16px;border-bottom:1px solid #30363d;flex-shrink:0}
.badge{font-size:.7rem;padding:2px 7px;border-radius:12px;white-space:nowrap;font-weight:600} .add-form .row{display:flex;gap:6px;margin-bottom:8px}
.connected{background:#1f4e79;color:#79c0ff} .add-form input{flex:1;padding:6px 10px;background:#0d1117;border:1px solid #30363d;border-radius:6px;color:#c9d1d9;font-size:.82rem;outline:none}
.status {background:#2d333b;color:#cdd9e5} .add-form input:focus{border-color:#58a6ff}
.robots {background:#3b2300;color:#f0883e} .add-form input.narrow{max-width:80px;flex:none}
.waiting {background:#1c2a1e;color:#56d364} .btn{padding:6px 14px;border:none;border-radius:6px;cursor:pointer;font-size:.82rem;font-weight:600;transition:opacity .15s}
.fetching {background:#172033;color:#79c0ff} .btn:hover{opacity:.85}
.saved {background:#0d2818;color:#56d364} .btn-green{background:#238636;color:#fff}
.links_found{background:#1f2d3d;color:#a5d6ff} .btn-gray{background:#30363d;color:#c9d1d9}
.skipped {background:#2d2d00;color:#e3b341} .btn-yellow{background:#9e6a03;color:#fff}
.error {background:#3d0000;color:#f85149} .btn-blue{background:#1f6feb;color:#fff}
.done {background:#1f4e2c;color:#56d364} .add-result{font-size:.75rem;color:#8b949e;min-height:16px}
.keepalive{background:#2d333b;color:#484f58;font-style:italic}
.ev-body{word-break:break-all;color:#cdd9e5} /* ── domain list ────────────────────── */
.status-dot{width:8px;height:8px;border-radius:50%;background:#484f58;display:inline-block;margin-right:6px;flex-shrink:0;margin-top:4px} .domain-list{flex:1;overflow-y:auto;padding:8px 0}
.status-dot.live{background:#56d364;animation:pulse 1.5s infinite} .domain-card{padding:10px 16px;cursor:pointer;border-left:3px solid transparent;transition:background .12s}
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}} .domain-card:hover{background:#1c2128}
.conn-row{display:flex;align-items:center;gap:8px;margin-bottom:8px} .domain-card.active{background:#1c2128;border-left-color:#58a6ff}
.domain-card .dc-name{font-size:.88rem;font-weight:600;color:#e6edf3;display:flex;align-items:center;gap:6px}
.domain-card .dc-meta{font-size:.72rem;color:#8b949e;margin-top:3px;display:flex;gap:10px}
.domain-card .dc-parent{font-size:.7rem;color:#6e7681;margin-top:2px}
/* ── status badge ───────────────────── */
.badge{font-size:.65rem;padding:1px 7px;border-radius:10px;font-weight:700;white-space:nowrap}
.b-running {background:#0d4429;color:#3fb950}
.b-paused {background:#3d2e00;color:#d29922}
.b-done {background:#0d2040;color:#58a6ff}
.b-pending {background:#282d33;color:#8b949e}
/* ── main area ──────────────────────── */
.main-header{padding:12px 20px;border-bottom:1px solid #30363d;display:flex;align-items:center;gap:12px;flex-shrink:0;background:#161b22}
.main-header h2{font-size:1rem;color:#e6edf3;flex:1}
.conn-dot{width:9px;height:9px;border-radius:50%;background:#3fb950;flex-shrink:0}
.conn-dot.off{background:#484f58}
.conn-dot.pulse{animation:pulse 1.5s infinite}
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.3}}
.log-wrap{flex:1;overflow-y:auto;padding:10px 16px;font-family:'Cascadia Code','Fira Code',monospace;font-size:.75rem}
.ev{display:flex;gap:8px;padding:3px 0;border-bottom:1px solid #161b22;align-items:flex-start}
.ev-badge{font-size:.67rem;padding:1px 7px;border-radius:10px;font-weight:700;white-space:nowrap;flex-shrink:0;margin-top:1px}
.ev-body{word-break:break-all;color:#8b949e}
.ev-body b{color:#c9d1d9}
/* event colours */
.e-connected {background:#0d2040;color:#58a6ff}
.e-status {background:#1c2128;color:#8b949e}
.e-robots {background:#3d2200;color:#d29922}
.e-waiting {background:#0d2e1a;color:#3fb950}
.e-fetching {background:#0d2040;color:#79c0ff}
.e-saved {background:#0d2e1a;color:#3fb950}
.e-links_found {background:#112040;color:#a5d6ff}
.e-skipped {background:#2d2600;color:#d29922}
.e-error {background:#3d0000;color:#f85149}
.e-done {background:#0d2040;color:#79c0ff}
.e-paused {background:#3d2e00;color:#d29922}
.e-resumed {background:#0d2e1a;color:#3fb950}
.e-new_domain {background:#1f1640;color:#d2a8ff}
.e-shutdown {background:#2d0000;color:#f85149}
.empty{color:#484f58;padding:20px;text-align:center;font-size:.82rem}
.stats-bar{display:flex;gap:16px;padding:6px 20px;background:#0d1117;border-bottom:1px solid #21262d;font-size:.73rem;color:#8b949e;flex-shrink:0}
.stats-bar span b{color:#c9d1d9}
/* ── no-domain placeholder ──────────── */
.placeholder{flex:1;display:flex;align-items:center;justify-content:center;flex-direction:column;gap:10px;color:#484f58}
.placeholder svg{opacity:.3}
</style> </style>
</head> </head>
<body> <body>
<h1>🕷 SiliconPin Spider</h1> <div class="layout">
<p class="sub">Polite web crawler — respects robots.txt · random delay · SSE live feed</p>
<div class="card"> <!-- ═══════════════ SIDEBAR ═══════════════ -->
<h2>Add domain</h2> <aside class="sidebar">
<div class="row"> <div class="sidebar-header">
<div> <h1>🕷 SiliconPin Spider</h1>
<label>Domain</label> <p>Polite web crawler · robots.txt · SSE live</p>
<input id="domain" placeholder="siliconpin.com" value=""/>
</div>
<div>
<label>Crawl-delay (s)</label>
<input id="delay" placeholder="20" value="20" style="max-width:100px"/>
</div>
</div> </div>
<button onclick="addDomain()">Add &amp; Crawl</button>
<div id="addResult" style="margin-top:10px;font-size:.82rem;color:#8b949e"></div>
</div>
<div class="card"> <div class="add-form">
<h2>Live SSE stream</h2> <div class="row">
<div class="conn-row"> <input id="inp-domain" placeholder="domain.com" autocomplete="off"/>
<span class="status-dot" id="dot"></span> <input id="inp-delay" placeholder="delay s" class="narrow" value="20"/>
<input id="sseDomain" placeholder="siliconpin.com" style="flex:1"/> </div>
<button onclick="connectSSE()">Connect</button> <div class="row">
<button onclick="clearLog()" style="background:#30363d">Clear</button> <button class="btn btn-green" onclick="addDomain()">+ Add &amp; Crawl</button>
<button class="btn btn-gray" onclick="refreshList()">↻ Refresh</button>
</div>
<div class="add-result" id="add-result"></div>
</div> </div>
<div id="log"><span style="color:#484f58">— events will appear here —</span></div>
<div class="domain-list" id="domain-list">
<div class="empty">No domains yet</div>
</div>
</aside>
<!-- ═══════════════ MAIN ═══════════════ -->
<main class="main">
<div id="main-placeholder" class="placeholder" style="flex:1;display:flex">
<svg width="64" height="64" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1">
<circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="12"/>
<line x1="12" y1="16" x2="12.01" y2="16"/>
</svg>
<p>Select a domain to watch its live feed</p>
</div>
<div id="main-panel" style="display:none;flex-direction:column;flex:1;overflow:hidden">
<div class="main-header">
<span class="conn-dot off" id="conn-dot"></span>
<h2 id="panel-title"></h2>
<button class="btn btn-yellow" id="btn-pause" onclick="pauseDomain()">⏸ Pause</button>
<button class="btn btn-blue" id="btn-resume" onclick="resumeDomain()" style="display:none">▶ Resume</button>
<button class="btn btn-gray" onclick="clearLog()">✕ Clear</button>
</div>
<div class="stats-bar" id="stats-bar">
<span>URLs: <b id="stat-urls"></b></span>
<span>Queue: <b id="stat-queue"></b></span>
<span>Status: <b id="stat-status"></b></span>
<span>Interval: <b id="stat-interval"></b>s</span>
</div>
<div class="log-wrap" id="log"></div>
</div>
</main>
</div> </div>
<script> <script>
let es = null; // ── state ──────────────────────────────────────────────────
let activeDomain = null;
let activeSSE = null;
let globalSSE = null;
let domainMap = {}; // domain → row data
// ── init ───────────────────────────────────────────────────
window.addEventListener('DOMContentLoaded', () => {
refreshList();
connectGlobalSSE();
setInterval(refreshList, 15000);
});
// ── global SSE (new_domain, shutdown) ──────────────────────
function connectGlobalSSE() {
if (globalSSE) globalSSE.close();
globalSSE = new EventSource('/api/sse/');
globalSSE.onmessage = e => {
try {
const obj = JSON.parse(e.data);
if (obj.event === 'new_domain') {
refreshList();
// auto-open if it was discovered from the active domain
if (activeDomain && obj.data.parent === activeDomain) {
appendLog({event:'new_domain', data:obj.data});
}
}
} catch(_) {}
};
}
// ── fetch domain list ───────────────────────────────────────
async function refreshList() {
const res = await fetch('/api/domains');
if (!res.ok) return;
const list = await res.json();
domainMap = {};
list.forEach(d => domainMap[d.domain] = d);
renderList(list);
if (activeDomain && domainMap[activeDomain]) updateStats(domainMap[activeDomain]);
}
function renderList(list) {
const el = document.getElementById('domain-list');
if (!list || list.length === 0) {
el.innerHTML = '<div class="empty">No domains yet</div>';
return;
}
el.innerHTML = list.map(d => `
<div class="domain-card${d.domain === activeDomain ? ' active' : ''}"
id="dc-${esc(d.domain)}" onclick="selectDomain('${esc(d.domain)}')">
<div class="dc-name">
<span>${esc(d.domain)}</span>
<span class="badge b-${d.status}">${d.status}</span>
</div>
<div class="dc-meta">
<span>✓ ${d.url_count} urls</span>
<span>⏳ ${d.queue_len} queued</span>
<span>⏱ ${d.interval}s</span>
</div>
${d.parent ? `<div class="dc-parent">↳ from ${esc(d.parent)}</div>` : ''}
</div>`).join('');
}
// ── select domain → connect SSE ────────────────────────────
function selectDomain(domain) {
if (activeDomain === domain) return;
activeDomain = domain;
// highlight sidebar
document.querySelectorAll('.domain-card').forEach(c => c.classList.remove('active'));
const card = document.getElementById('dc-' + domain);
if (card) card.classList.add('active');
// show panel
document.getElementById('main-placeholder').style.display = 'none';
const panel = document.getElementById('main-panel');
panel.style.display = 'flex';
document.getElementById('panel-title').textContent = domain;
clearLog();
// update stats
if (domainMap[domain]) updateStats(domainMap[domain]);
// SSE
if (activeSSE) activeSSE.close();
setDot(true);
activeSSE = new EventSource('/api/sse/' + domain);
activeSSE.onmessage = e => {
try { appendLog(JSON.parse(e.data)); } catch(_) {}
};
activeSSE.onerror = () => setDot(false);
}
function updateStats(d) {
document.getElementById('stat-urls').textContent = d.url_count;
document.getElementById('stat-queue').textContent = d.queue_len;
document.getElementById('stat-status').textContent = d.status;
document.getElementById('stat-interval').textContent = d.interval;
const paused = d.status === 'paused';
document.getElementById('btn-pause').style.display = paused ? 'none' : '';
document.getElementById('btn-resume').style.display = paused ? '' : 'none';
}
// ── log rendering ───────────────────────────────────────────
function appendLog(obj) {
const event = obj.event || 'status';
const data = obj.data || {};
// update stats inline from events
if (event === 'saved' || event === 'links_found' || event === 'waiting') {
if (domainMap[activeDomain] && event === 'saved') {
domainMap[activeDomain].url_count++;
document.getElementById('stat-urls').textContent = domainMap[activeDomain].url_count;
}
if (data.queue_len !== undefined && domainMap[activeDomain]) {
domainMap[activeDomain].queue_len = data.queue_len ?? data.queue ?? 0;
document.getElementById('stat-queue').textContent = domainMap[activeDomain].queue_len;
}
}
if (event === 'paused' || event === 'resumed' || event === 'done') {
refreshList();
const st = event === 'paused' ? 'paused' : event === 'done' ? 'done' : 'running';
document.getElementById('stat-status').textContent = st;
document.getElementById('btn-pause').style.display = (event === 'paused') ? 'none' : '';
document.getElementById('btn-resume').style.display = (event === 'paused') ? '' : 'none';
}
if (event === 'new_domain') refreshList();
const body = formatBody(event, data);
const log = document.getElementById('log');
if (log.querySelector('.empty')) log.innerHTML = '';
const div = document.createElement('div');
div.className = 'ev';
div.innerHTML = `<span class="ev-badge e-${event}">${event}</span><span class="ev-body">${body}</span>`;
log.appendChild(div);
log.scrollTop = log.scrollHeight;
}
function formatBody(event, data) {
if (typeof data === 'string') return esc(data);
switch (event) {
case 'waiting':
return `<b>${esc(data.url||'')}</b> — delay <b>${data.delay_s}s</b>, queue <b>${data.queue}</b>`;
case 'fetching':
return `<b>${esc(data.url||'')}</b>`;
case 'saved':
return `<b>${esc(data.url||'')}</b> <span style="color:#6e7681">[${data.status} ${esc(data.content_type||'')}]</span>`;
case 'links_found':
return `<b>${esc(data.url||'')}</b> — found <b>${data.found}</b> same-host, <b>${data.new}</b> new, <b>${data.external||0}</b> external, queue <b>${data.queue_len}</b>`;
case 'skipped':
return `<b>${esc(data.url||'')}</b> — ${esc(data.reason||'')}`;
case 'error':
return `<b>${esc(data.url||data.msg||'')}</b>${data.url ? ' — '+esc(data.msg||'') : ''}`;
case 'robots':
return `delay <b>${data.robots_delay}s</b> → effective <b>${data.effective_delay}s</b>, disallowed: <b>${(data.disallowed||[]).length}</b>`;
case 'new_domain':
return `discovered <b>${esc(data.domain||'')}</b>${data.parent ? ` from <b>${esc(data.parent)}</b>` : ''}`;
case 'done':
return `<b>${esc(data.domain||'')}</b> — crawl complete ✓`;
case 'paused':
return `crawler paused — send <b>resume</b> to continue`;
case 'resumed':
return `crawler resumed`;
case 'connected':
return `stream connected for <b>${esc(data.domain||'')}</b>`;
default:
return esc(typeof data === 'object' ? JSON.stringify(data) : String(data));
}
}
// ── add domain ──────────────────────────────────────────────
async function addDomain() { async function addDomain() {
const domain = document.getElementById('domain').value.trim(); const domain = document.getElementById('inp-domain').value.trim();
const delay = document.getElementById('delay').value.trim(); const delay = document.getElementById('inp-delay').value.trim();
if (!domain) { alert('Domain is required'); return; } if (!domain) { showResult('Domain is required', true); return; }
const res = await fetch('/api/add_domain', { const res = await fetch('/api/add_domain', {
method: 'POST', method: 'POST',
headers: {'Content-Type':'application/json'}, headers: {'Content-Type':'application/json'},
body: JSON.stringify({domain, 'Crawl-delay': delay}) body: JSON.stringify({domain, 'Crawl-delay': delay})
}); });
const data = await res.json(); const data = await res.json();
const el = document.getElementById('addResult');
if (res.ok) { if (res.ok) {
el.style.color = '#56d364'; showResult(`${data.message}`, false);
el.textContent = `${data.message} — SSE: ${data.sse}`; document.getElementById('inp-domain').value = '';
document.getElementById('sseDomain').value = domain; await refreshList();
selectDomain(data.domain);
} else { } else {
el.style.color = '#f85149'; showResult(`${data.error}`, true);
el.textContent = `${data.error}`;
} }
} }
function connectSSE() { function showResult(msg, err) {
const domain = document.getElementById('sseDomain').value.trim(); const el = document.getElementById('add-result');
if (!domain) { alert('Enter a domain'); return; } el.textContent = msg;
if (es) { es.close(); } el.style.color = err ? '#f85149' : '#3fb950';
document.getElementById('dot').className = 'status-dot live';
es = new EventSource('/api/sse/' + domain);
es.onmessage = function(e) { appendEvent(e.data); };
es.onerror = function() {
appendRaw('keepalive','connection error / closed');
document.getElementById('dot').className = 'status-dot';
};
} }
function appendEvent(raw) { // ── pause / resume ──────────────────────────────────────────
let obj; async function pauseDomain() {
try { obj = JSON.parse(raw); } catch(e) { appendRaw('status', raw); return; } if (!activeDomain) return;
const event = obj.event || 'status'; await fetch('/api/pause/' + activeDomain, {method:'POST'});
const data = typeof obj.data === 'object' ? JSON.stringify(obj.data) : String(obj.data);
appendRaw(event, data);
} }
function appendRaw(event, text) { async function resumeDomain() {
const log = document.getElementById('log'); if (!activeDomain) return;
if (log.querySelector('span')) log.innerHTML = ''; await fetch('/api/resume/' + activeDomain, {method:'POST'});
const div = document.createElement('div');
div.className = 'ev';
div.innerHTML = `<span class="badge ${event}">${event}</span><span class="ev-body">${escHtml(text)}</span>`;
log.appendChild(div);
log.scrollTop = log.scrollHeight;
} }
// ── utils ───────────────────────────────────────────────────
function clearLog() { function clearLog() {
document.getElementById('log').innerHTML = '<span style="color:#484f58">— cleared —</span>'; document.getElementById('log').innerHTML = '<div class="empty">— waiting for events —</div>';
} }
function escHtml(s) { function setDot(live) {
return s.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); const dot = document.getElementById('conn-dot');
dot.className = 'conn-dot' + (live ? ' pulse' : ' off');
} }
function esc(s) {
return String(s)
.replace(/&/g,'&amp;').replace(/</g,'&lt;')
.replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}
// keyboard shortcut: Enter in domain input
document.addEventListener('keydown', e => {
if (e.key === 'Enter' && document.activeElement.id === 'inp-domain') addDomain();
});
</script> </script>
</body> </body>
</html> </html>