handleRestart

This commit is contained in:
Kar
2026-02-20 20:22:57 +05:30
parent ddfec3a8e1
commit 309ba5fbae

88
main.go
View File

@@ -164,6 +164,16 @@ func openDomainDB(domain string) (*sql.DB, error) {
db.Close()
return nil, err
}
_, err = db.Exec(`
CREATE TABLE IF NOT EXISTS queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
added_at DATETIME NOT NULL
)`)
if err != nil {
db.Close()
return nil, err
}
domainDBsMu.Lock()
domainDBs[domain] = db
@@ -190,6 +200,57 @@ func isURLKnown(db *sql.DB, rawURL string) bool {
return c > 0
}
// ── persistent queue helpers ──────────────────────────────────────
// enqueueURL adds a URL to the persistent queue if not already there
// and not already crawled. The UNIQUE constraint on queue.url plus
// INSERT OR IGNORE makes duplicate enqueues a harmless no-op.
// A failed insert is logged rather than returned: losing one enqueue
// only means the URL will be rediscovered on a later crawl pass.
func enqueueURL(db *sql.DB, rawURL string) {
	now := time.Now().UTC().Format(time.RFC3339)
	if _, err := db.Exec(`INSERT OR IGNORE INTO queue (url, added_at) VALUES (?, ?)`, rawURL, now); err != nil {
		log.Printf("enqueue %s: %v", rawURL, err)
	}
}
// dequeueURL pops and returns the oldest queued URL (FIFO order).
// The second return value is false when the queue is empty or when
// any database step fails; the caller cannot distinguish the two,
// which is fine — both mean "nothing to crawl right now".
func dequeueURL(db *sql.DB) (string, bool) {
	tx, err := db.Begin()
	if err != nil {
		return "", false
	}
	// Rollback after a successful Commit is a no-op (ErrTxDone).
	defer tx.Rollback() //nolint:errcheck

	var (
		id     int64
		rawURL string
	)
	row := tx.QueryRow(`SELECT id, url FROM queue ORDER BY id ASC LIMIT 1`)
	if err := row.Scan(&id, &rawURL); err != nil {
		return "", false // empty
	}

	if _, err := tx.Exec(`DELETE FROM queue WHERE id = ?`, id); err != nil {
		return "", false
	}
	if tx.Commit() != nil {
		return "", false
	}
	return rawURL, true
}
// queueLen reports how many URLs are currently pending in the queue.
// Best-effort: a scan failure leaves n at its zero value, so errors
// simply read as an empty queue.
func queueLen(db *sql.DB) int {
	var n int
	row := db.QueryRow(`SELECT COUNT(1) FROM queue`)
	_ = row.Scan(&n) // deliberate: 0 on error
	return n
}
// seedQueue inserts the start URL only when the queue is completely empty
// AND nothing has ever been crawled (first ever run). On restart the
// persisted queue is used as-is.
//
// Unlike the naive version, a failed count query aborts the seed instead
// of treating the count as 0 — otherwise a transient DB error on restart
// could re-enqueue the start URL into an already-populated database.
func seedQueue(db *sql.DB, startURL string) {
	var qCount, uCount int
	if err := db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&qCount); err != nil {
		return // state unknown — do not risk re-seeding
	}
	if err := db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&uCount); err != nil {
		return
	}
	if qCount == 0 && uCount == 0 {
		enqueueURL(db, startURL)
	}
}
// ─────────────────────────────────────────────────────────────────
// robots.txt (minimal, single-pass parser)
// ─────────────────────────────────────────────────────────────────
@@ -312,9 +373,12 @@ func crawlDomain(domain string, intervalSec int) {
"effective_delay": intervalSec,
})
// ── Persistent BFS queue ────────────────────────────────────
// On first run: seed with the start URL.
// On restart: the queue table already holds the pending URLs —
// we just continue from where we left off.
startURL := "https://" + domain + "/"
seedQueue(db, startURL)
httpClient := &http.Client{
Timeout: 30 * time.Second,
@@ -326,16 +390,20 @@ func crawlDomain(domain string, intervalSec int) {
},
}
for {
// Re-read interval in case it was updated via API
var cur int
if err := mainDB.QueryRow(`SELECT interval FROM domains WHERE domain=?`, domain).Scan(&cur); err == nil && cur > 0 {
intervalSec = cur
}
target, ok := dequeueURL(db)
if !ok {
break // queue exhausted
}
// Skip if already crawled (can happen if same URL was enqueued
// multiple times before being dequeued, or after a re-seed)
if isURLKnown(db, target) {
continue
}
@@ -356,7 +424,7 @@ func crawlDomain(domain string, intervalSec int) {
emit(br, "waiting", map[string]interface{}{
"url": target,
"delay_s": delaySec,
"queue": queueLen(db),
})
time.Sleep(delay)
@@ -366,6 +434,8 @@ func crawlDomain(domain string, intervalSec int) {
if err != nil {
emit(br, "error", map[string]string{"url": target, "msg": err.Error()})
log.Printf("[%s] fetch error %s: %v", domain, target, err)
// Re-enqueue so it's retried next run
enqueueURL(db, target)
continue
}
@@ -389,13 +459,13 @@ func crawlDomain(domain string, intervalSec int) {
log.Printf("[%s] saved: %s", domain, target)
}
// discover links from HTML pages
if isHTML && resp.StatusCode == 200 {
links := extractLinks(parsed, bodyStr)
newCount := 0
for _, link := range links {
if !isURLKnown(db, link) {
enqueueURL(db, link)
newCount++
}
}
@@ -403,7 +473,7 @@ func crawlDomain(domain string, intervalSec int) {
"url": target,
"found": len(links),
"new": newCount,
"queue_len": queueLen(db),
})
}
}