spiderJump
This commit is contained in:
616
main.go
616
main.go
@@ -23,33 +23,49 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// Global state
|
// Constants & globals
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const mainDBFile = "siliconpin_spider.sqlite"
|
const mainDBFile = "siliconpin_spider.sqlite"
|
||||||
|
|
||||||
var mainDB *sql.DB
|
var mainDB *sql.DB
|
||||||
|
|
||||||
// per-domain SSE brokers
|
// SSE brokers – one per domain
|
||||||
var (
|
var (
|
||||||
brokersMu sync.RWMutex
|
brokersMu sync.RWMutex
|
||||||
brokers = map[string]*Broker{}
|
brokers = map[string]*Broker{}
|
||||||
)
|
)
|
||||||
|
|
||||||
// per-domain DB connections (kept open)
|
// open domain DB handles
|
||||||
var (
|
var (
|
||||||
domainDBsMu sync.RWMutex
|
domainDBsMu sync.RWMutex
|
||||||
domainDBs = map[string]*sql.DB{}
|
domainDBs = map[string]*sql.DB{}
|
||||||
)
|
)
|
||||||
|
|
||||||
// guard against duplicate crawlers
|
// crawler goroutine guard
|
||||||
var (
|
var (
|
||||||
crawlersMu sync.Mutex
|
crawlersMu sync.Mutex
|
||||||
crawlers = map[string]bool{}
|
crawlers = map[string]bool{}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// pause/resume channels – one per domain
|
||||||
|
// sending to pauseCh pauses; sending to resumeCh resumes
|
||||||
|
var (
|
||||||
|
pauseChsMu sync.RWMutex
|
||||||
|
pauseChs = map[string]chan struct{}{} // pause signal
|
||||||
|
resumeChs = map[string]chan struct{}{} // resume signal
|
||||||
|
)
|
||||||
|
|
||||||
|
// domain status values stored in main DB
|
||||||
|
const (
|
||||||
|
statusRunning = "running"
|
||||||
|
statusPaused = "paused"
|
||||||
|
statusDone = "done"
|
||||||
|
statusPending = "pending"
|
||||||
|
)
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// SSE Broker – fan-out to multiple subscribers per domain
|
// SSE Broker
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type Broker struct {
|
type Broker struct {
|
||||||
@@ -57,9 +73,7 @@ type Broker struct {
|
|||||||
clients map[chan string]struct{}
|
clients map[chan string]struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func newBroker() *Broker {
|
func newBroker() *Broker { return &Broker{clients: make(map[chan string]struct{})} }
|
||||||
return &Broker{clients: make(map[chan string]struct{})}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Broker) subscribe() chan string {
|
func (b *Broker) subscribe() chan string {
|
||||||
ch := make(chan string, 64)
|
ch := make(chan string, 64)
|
||||||
@@ -81,7 +95,7 @@ func (b *Broker) publish(msg string) {
|
|||||||
for ch := range b.clients {
|
for ch := range b.clients {
|
||||||
select {
|
select {
|
||||||
case ch <- msg:
|
case ch <- msg:
|
||||||
default: // slow client – drop message
|
default:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -103,22 +117,29 @@ func getBroker(domain string) *Broker {
|
|||||||
return br
|
return br
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
type ssePayload struct {
|
||||||
// SSE event helper
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
type sseEvent struct {
|
|
||||||
Event string `json:"event"`
|
Event string `json:"event"`
|
||||||
Data interface{} `json:"data"`
|
Data interface{} `json:"data"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func emit(br *Broker, event string, data interface{}) {
|
func emit(br *Broker, event string, data interface{}) {
|
||||||
payload, _ := json.Marshal(sseEvent{Event: event, Data: data})
|
b, _ := json.Marshal(ssePayload{Event: event, Data: data})
|
||||||
br.publish(string(payload))
|
br.publish(string(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
// broadcast emits to ALL domain brokers (e.g. for a new_domain event)
|
||||||
|
func broadcast(event string, data interface{}) {
|
||||||
|
brokersMu.RLock()
|
||||||
|
defer brokersMu.RUnlock()
|
||||||
|
b, _ := json.Marshal(ssePayload{Event: event, Data: data})
|
||||||
|
msg := string(b)
|
||||||
|
for _, br := range brokers {
|
||||||
|
br.publish(msg)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// Database helpers
|
// Main DB helpers
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func initMainDB() {
|
func initMainDB() {
|
||||||
@@ -132,6 +153,8 @@ func initMainDB() {
|
|||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
domain TEXT NOT NULL UNIQUE,
|
domain TEXT NOT NULL UNIQUE,
|
||||||
interval INTEGER NOT NULL DEFAULT 60,
|
interval INTEGER NOT NULL DEFAULT 60,
|
||||||
|
status TEXT NOT NULL DEFAULT 'pending',
|
||||||
|
parent TEXT NOT NULL DEFAULT '',
|
||||||
created_at DATETIME NOT NULL,
|
created_at DATETIME NOT NULL,
|
||||||
updated_at DATETIME NOT NULL
|
updated_at DATETIME NOT NULL
|
||||||
)`)
|
)`)
|
||||||
@@ -141,6 +164,67 @@ func initMainDB() {
|
|||||||
log.Printf("Main DB ready: %s", mainDBFile)
|
log.Printf("Main DB ready: %s", mainDBFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func setDomainStatus(domain, status string) {
|
||||||
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
|
mainDB.Exec(`UPDATE domains SET status=?, updated_at=? WHERE domain=?`, status, now, domain)
|
||||||
|
}
|
||||||
|
|
||||||
|
type DomainRow struct {
|
||||||
|
ID int `json:"id"`
|
||||||
|
Domain string `json:"domain"`
|
||||||
|
Interval int `json:"interval"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Parent string `json:"parent,omitempty"`
|
||||||
|
URLCount int `json:"url_count"`
|
||||||
|
QueueLen int `json:"queue_len"`
|
||||||
|
CreatedAt string `json:"created_at"`
|
||||||
|
UpdatedAt string `json:"updated_at"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func listDomains() ([]DomainRow, error) {
|
||||||
|
rows, err := mainDB.Query(
|
||||||
|
`SELECT id, domain, interval, status, parent, created_at, updated_at
|
||||||
|
FROM domains ORDER BY id ASC`)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var out []DomainRow
|
||||||
|
for rows.Next() {
|
||||||
|
var d DomainRow
|
||||||
|
if err := rows.Scan(&d.ID, &d.Domain, &d.Interval, &d.Status,
|
||||||
|
&d.Parent, &d.CreatedAt, &d.UpdatedAt); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// get live counts from domain DB
|
||||||
|
if db, err2 := openDomainDB(d.Domain); err2 == nil {
|
||||||
|
db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&d.URLCount)
|
||||||
|
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&d.QueueLen)
|
||||||
|
}
|
||||||
|
out = append(out, d)
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// registerDomain upserts a domain in the main DB.
|
||||||
|
// parentDomain is "" for user-added domains, otherwise the domain that found it.
|
||||||
|
func registerDomain(domain string, interval int, parentDomain string) error {
|
||||||
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
|
_, err := mainDB.Exec(`
|
||||||
|
INSERT INTO domains (domain, interval, status, parent, created_at, updated_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT(domain) DO UPDATE SET
|
||||||
|
interval=excluded.interval,
|
||||||
|
updated_at=excluded.updated_at`,
|
||||||
|
domain, interval, statusPending, parentDomain, now, now)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
// Domain DB helpers
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func openDomainDB(domain string) (*sql.DB, error) {
|
func openDomainDB(domain string) (*sql.DB, error) {
|
||||||
domainDBsMu.RLock()
|
domainDBsMu.RLock()
|
||||||
db, ok := domainDBs[domain]
|
db, ok := domainDBs[domain]
|
||||||
@@ -153,24 +237,32 @@ func openDomainDB(domain string) (*sql.DB, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
_, err = db.Exec(`
|
if _, err = db.Exec(`
|
||||||
CREATE TABLE IF NOT EXISTS urls (
|
CREATE TABLE IF NOT EXISTS urls (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
url TEXT NOT NULL UNIQUE,
|
url TEXT NOT NULL UNIQUE,
|
||||||
created_at DATETIME NOT NULL,
|
created_at DATETIME NOT NULL,
|
||||||
updated_at DATETIME NOT NULL
|
updated_at DATETIME NOT NULL
|
||||||
)`)
|
)`); err != nil {
|
||||||
if err != nil {
|
|
||||||
db.Close()
|
db.Close()
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
_, err = db.Exec(`
|
if _, err = db.Exec(`
|
||||||
CREATE TABLE IF NOT EXISTS queue (
|
CREATE TABLE IF NOT EXISTS queue (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
url TEXT NOT NULL UNIQUE,
|
url TEXT NOT NULL UNIQUE,
|
||||||
added_at DATETIME NOT NULL
|
added_at DATETIME NOT NULL
|
||||||
)`)
|
)`); err != nil {
|
||||||
if err != nil {
|
db.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
// cross-domain links discovered during crawl
|
||||||
|
if _, err = db.Exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS ext_links (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
ext_domain TEXT NOT NULL UNIQUE,
|
||||||
|
found_at DATETIME NOT NULL
|
||||||
|
)`); err != nil {
|
||||||
db.Close()
|
db.Close()
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -185,8 +277,7 @@ func insertURL(db *sql.DB, rawURL string) (bool, error) {
|
|||||||
now := time.Now().UTC().Format(time.RFC3339)
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
res, err := db.Exec(
|
res, err := db.Exec(
|
||||||
`INSERT OR IGNORE INTO urls (url, created_at, updated_at) VALUES (?, ?, ?)`,
|
`INSERT OR IGNORE INTO urls (url, created_at, updated_at) VALUES (?, ?, ?)`,
|
||||||
rawURL, now, now,
|
rawURL, now, now)
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
@@ -200,29 +291,22 @@ func isURLKnown(db *sql.DB, rawURL string) bool {
|
|||||||
return c > 0
|
return c > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── persistent queue helpers ──────────────────────────────────────
|
|
||||||
|
|
||||||
// enqueueURL adds a URL to the persistent queue if not already there
|
|
||||||
// and not already crawled.
|
|
||||||
func enqueueURL(db *sql.DB, rawURL string) {
|
func enqueueURL(db *sql.DB, rawURL string) {
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
db.Exec(`INSERT OR IGNORE INTO queue (url, added_at) VALUES (?, ?)`, rawURL, now)
|
db.Exec(`INSERT OR IGNORE INTO queue (url, added_at) VALUES (?, ?)`, rawURL, now)
|
||||||
}
|
}
|
||||||
|
|
||||||
// dequeueURL removes and returns the oldest queued URL (FIFO).
|
|
||||||
// Returns "", false when the queue is empty.
|
|
||||||
func dequeueURL(db *sql.DB) (string, bool) {
|
func dequeueURL(db *sql.DB) (string, bool) {
|
||||||
tx, err := db.Begin()
|
tx, err := db.Begin()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", false
|
return "", false
|
||||||
}
|
}
|
||||||
defer tx.Rollback() //nolint:errcheck
|
defer tx.Rollback() //nolint:errcheck
|
||||||
|
|
||||||
var id int64
|
var id int64
|
||||||
var rawURL string
|
var rawURL string
|
||||||
err = tx.QueryRow(`SELECT id, url FROM queue ORDER BY id ASC LIMIT 1`).Scan(&id, &rawURL)
|
if err = tx.QueryRow(`SELECT id, url FROM queue ORDER BY id ASC LIMIT 1`).
|
||||||
if err != nil {
|
Scan(&id, &rawURL); err != nil {
|
||||||
return "", false // empty
|
return "", false
|
||||||
}
|
}
|
||||||
if _, err = tx.Exec(`DELETE FROM queue WHERE id = ?`, id); err != nil {
|
if _, err = tx.Exec(`DELETE FROM queue WHERE id = ?`, id); err != nil {
|
||||||
return "", false
|
return "", false
|
||||||
@@ -233,31 +317,138 @@ func dequeueURL(db *sql.DB) (string, bool) {
|
|||||||
return rawURL, true
|
return rawURL, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// queueLen returns the current number of pending URLs.
|
|
||||||
func queueLen(db *sql.DB) int {
|
func queueLen(db *sql.DB) int {
|
||||||
var n int
|
var n int
|
||||||
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&n)
|
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&n)
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
|
|
||||||
// seedQueue inserts the start URL only when the queue is completely empty
|
|
||||||
// (first ever run). On restart the persisted queue is used as-is.
|
|
||||||
func seedQueue(db *sql.DB, startURL string) {
|
func seedQueue(db *sql.DB, startURL string) {
|
||||||
var qCount, uCount int
|
var qc, uc int
|
||||||
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&qCount)
|
db.QueryRow(`SELECT COUNT(1) FROM queue`).Scan(&qc)
|
||||||
db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&uCount)
|
db.QueryRow(`SELECT COUNT(1) FROM urls`).Scan(&uc)
|
||||||
if qCount == 0 && uCount == 0 {
|
if qc == 0 && uc == 0 {
|
||||||
enqueueURL(db, startURL)
|
enqueueURL(db, startURL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// recordExtLink saves a discovered external domain and auto-registers it.
|
||||||
|
func recordExtLink(srcDomain, extDomain string, parentInterval int) {
|
||||||
|
db, err := openDomainDB(srcDomain)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
|
res, _ := db.Exec(
|
||||||
|
`INSERT OR IGNORE INTO ext_links (ext_domain, found_at) VALUES (?, ?)`,
|
||||||
|
extDomain, now)
|
||||||
|
n, _ := res.RowsAffected()
|
||||||
|
if n == 0 {
|
||||||
|
return // already recorded
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register in main DB (inherit parent's interval)
|
||||||
|
if err := registerDomain(extDomain, parentInterval, srcDomain); err != nil {
|
||||||
|
log.Printf("registerDomain %s (from %s): %v", extDomain, srcDomain, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("[%s] discovered external domain: %s", srcDomain, extDomain)
|
||||||
|
|
||||||
|
// Notify UI
|
||||||
|
broadcast("new_domain", map[string]string{
|
||||||
|
"domain": extDomain,
|
||||||
|
"parent": srcDomain,
|
||||||
|
})
|
||||||
|
|
||||||
|
// Init the new domain's DB and start its crawler
|
||||||
|
if _, err := openDomainDB(extDomain); err != nil {
|
||||||
|
log.Printf("openDomainDB %s: %v", extDomain, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
crawlersMu.Lock()
|
||||||
|
if !crawlers[extDomain] {
|
||||||
|
crawlers[extDomain] = true
|
||||||
|
go crawlDomain(extDomain, parentInterval)
|
||||||
|
}
|
||||||
|
crawlersMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// robots.txt (minimal, single-pass parser)
|
// Pause / resume machinery
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func ensurePauseChannels(domain string) {
|
||||||
|
pauseChsMu.Lock()
|
||||||
|
defer pauseChsMu.Unlock()
|
||||||
|
if _, ok := pauseChs[domain]; !ok {
|
||||||
|
pauseChs[domain] = make(chan struct{}, 1)
|
||||||
|
resumeChs[domain] = make(chan struct{}, 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pauseCrawler signals the crawler to pause. Non-blocking.
|
||||||
|
func pauseCrawler(domain string) {
|
||||||
|
pauseChsMu.RLock()
|
||||||
|
ch, ok := pauseChs[domain]
|
||||||
|
pauseChsMu.RUnlock()
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case ch <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// resumeCrawler signals a paused crawler to continue. Non-blocking.
|
||||||
|
func resumeCrawler(domain string) {
|
||||||
|
pauseChsMu.RLock()
|
||||||
|
ch, ok := resumeChs[domain]
|
||||||
|
pauseChsMu.RUnlock()
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case ch <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkPause is called inside the crawl loop between requests.
|
||||||
|
// If a pause signal is pending it blocks until resume arrives.
|
||||||
|
func checkPause(domain string, br *Broker) {
|
||||||
|
pauseChsMu.RLock()
|
||||||
|
pCh := pauseChs[domain]
|
||||||
|
rCh := resumeChs[domain]
|
||||||
|
pauseChsMu.RUnlock()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-pCh:
|
||||||
|
setDomainStatus(domain, statusPaused)
|
||||||
|
emit(br, "paused", map[string]string{"domain": domain})
|
||||||
|
log.Printf("[%s] paused", domain)
|
||||||
|
// drain any duplicate pause signals
|
||||||
|
for len(pCh) > 0 {
|
||||||
|
<-pCh
|
||||||
|
}
|
||||||
|
// block until resume
|
||||||
|
<-rCh
|
||||||
|
setDomainStatus(domain, statusRunning)
|
||||||
|
emit(br, "resumed", map[string]string{"domain": domain})
|
||||||
|
log.Printf("[%s] resumed", domain)
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
// robots.txt
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type robotsRules struct {
|
type robotsRules struct {
|
||||||
disallowed []string
|
disallowed []string
|
||||||
crawlDelay int // 0 = not set
|
crawlDelay int
|
||||||
}
|
}
|
||||||
|
|
||||||
func fetchRobots(domain string) *robotsRules {
|
func fetchRobots(domain string) *robotsRules {
|
||||||
@@ -277,11 +468,9 @@ func fetchRobots(domain string) *robotsRules {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
lower := strings.ToLower(line)
|
lower := strings.ToLower(line)
|
||||||
|
|
||||||
if strings.HasPrefix(lower, "user-agent:") {
|
if strings.HasPrefix(lower, "user-agent:") {
|
||||||
agent := strings.TrimSpace(line[len("user-agent:"):])
|
agent := strings.TrimSpace(line[len("user-agent:"):])
|
||||||
inSection = agent == "*" ||
|
inSection = agent == "*" || strings.EqualFold(agent, "siliconpin_spider")
|
||||||
strings.EqualFold(agent, "siliconpin_spider")
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if !inSection {
|
if !inSection {
|
||||||
@@ -310,14 +499,21 @@ func (r *robotsRules) allowed(path string) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// Link extractor – same-host HTML links only
|
// Link extractor
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var hrefRe = regexp.MustCompile(`(?i)href=["']([^"'#][^"']*)["']`)
|
var hrefRe = regexp.MustCompile(`(?i)href=["']([^"'#][^"']*)["']`)
|
||||||
|
|
||||||
func extractLinks(base *url.URL, body string) []string {
|
type extractedLinks struct {
|
||||||
seen := map[string]bool{}
|
sameHost []string
|
||||||
var links []string
|
external []string // distinct external hostnames (not full URLs)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractLinks(base *url.URL, body string) extractedLinks {
|
||||||
|
seenSame := map[string]bool{}
|
||||||
|
seenExt := map[string]bool{}
|
||||||
|
var result extractedLinks
|
||||||
|
|
||||||
for _, m := range hrefRe.FindAllStringSubmatch(body, -1) {
|
for _, m := range hrefRe.FindAllStringSubmatch(body, -1) {
|
||||||
href := strings.TrimSpace(m[1])
|
href := strings.TrimSpace(m[1])
|
||||||
parsed, err := url.Parse(href)
|
parsed, err := url.Parse(href)
|
||||||
@@ -330,16 +526,25 @@ func extractLinks(base *url.URL, body string) []string {
|
|||||||
if resolved.Scheme != "http" && resolved.Scheme != "https" {
|
if resolved.Scheme != "http" && resolved.Scheme != "https" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if !strings.EqualFold(resolved.Hostname(), base.Hostname()) {
|
host := strings.ToLower(resolved.Hostname())
|
||||||
continue
|
baseHost := strings.ToLower(base.Hostname())
|
||||||
}
|
|
||||||
s := resolved.String()
|
if host == baseHost {
|
||||||
if !seen[s] {
|
s := resolved.String()
|
||||||
seen[s] = true
|
if !seenSame[s] {
|
||||||
links = append(links, s)
|
seenSame[s] = true
|
||||||
|
result.sameHost = append(result.sameHost, s)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// strip www. for normalisation
|
||||||
|
extDomain := strings.TrimPrefix(host, "www.")
|
||||||
|
if extDomain != "" && !seenExt[extDomain] && isValidDomain(extDomain) {
|
||||||
|
seenExt[extDomain] = true
|
||||||
|
result.external = append(result.external, extDomain)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return links
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
@@ -347,20 +552,21 @@ func extractLinks(base *url.URL, body string) []string {
|
|||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func crawlDomain(domain string, intervalSec int) {
|
func crawlDomain(domain string, intervalSec int) {
|
||||||
log.Printf("[%s] crawler started (base interval %ds)", domain, intervalSec)
|
log.Printf("[%s] crawler started (interval %ds)", domain, intervalSec)
|
||||||
br := getBroker(domain)
|
br := getBroker(domain)
|
||||||
|
ensurePauseChannels(domain)
|
||||||
|
|
||||||
db, err := openDomainDB(domain)
|
db, err := openDomainDB(domain)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
emit(br, "error", map[string]string{"msg": "DB error: " + err.Error()})
|
emit(br, "error", map[string]string{"msg": "DB open failed: " + err.Error()})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── robots.txt ──────────────────────────────────────────────
|
setDomainStatus(domain, statusRunning)
|
||||||
|
|
||||||
|
// robots.txt
|
||||||
emit(br, "status", map[string]string{"msg": "fetching robots.txt"})
|
emit(br, "status", map[string]string{"msg": "fetching robots.txt"})
|
||||||
robots := fetchRobots(domain)
|
robots := fetchRobots(domain)
|
||||||
|
|
||||||
// robots.txt crawl-delay overrides our setting if higher
|
|
||||||
if robots.crawlDelay > intervalSec {
|
if robots.crawlDelay > intervalSec {
|
||||||
intervalSec = robots.crawlDelay
|
intervalSec = robots.crawlDelay
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
now := time.Now().UTC().Format(time.RFC3339)
|
||||||
@@ -373,10 +579,6 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
"effective_delay": intervalSec,
|
"effective_delay": intervalSec,
|
||||||
})
|
})
|
||||||
|
|
||||||
// ── Persistent BFS queue ────────────────────────────────────
|
|
||||||
// On first run: seed with the start URL.
|
|
||||||
// On restart: the queue table already holds the pending URLs —
|
|
||||||
// we just continue from where we left off.
|
|
||||||
startURL := "https://" + domain + "/"
|
startURL := "https://" + domain + "/"
|
||||||
seedQueue(db, startURL)
|
seedQueue(db, startURL)
|
||||||
|
|
||||||
@@ -391,24 +593,25 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
// Re-read interval in case it was updated via API
|
// ── pause check ─────────────────────────────────────────
|
||||||
|
checkPause(domain, br)
|
||||||
|
|
||||||
|
// ── re-read interval ────────────────────────────────────
|
||||||
var cur int
|
var cur int
|
||||||
if err := mainDB.QueryRow(`SELECT interval FROM domains WHERE domain=?`, domain).Scan(&cur); err == nil && cur > 0 {
|
if mainDB.QueryRow(`SELECT interval FROM domains WHERE domain=?`, domain).
|
||||||
|
Scan(&cur) == nil && cur > 0 {
|
||||||
intervalSec = cur
|
intervalSec = cur
|
||||||
}
|
}
|
||||||
|
|
||||||
target, ok := dequeueURL(db)
|
target, ok := dequeueURL(db)
|
||||||
if !ok {
|
if !ok {
|
||||||
break // queue exhausted
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip if already crawled (can happen if same URL was enqueued
|
|
||||||
// multiple times before being dequeued, or after a re-seed)
|
|
||||||
if isURLKnown(db, target) {
|
if isURLKnown(db, target) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// robots check
|
|
||||||
parsed, err := url.Parse(target)
|
parsed, err := url.Parse(target)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
@@ -418,24 +621,24 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// random delay: [interval, interval*2] seconds
|
// random delay [interval, interval*2]
|
||||||
delaySec := intervalSec + rand.Intn(intervalSec+1)
|
delaySec := intervalSec + rand.Intn(intervalSec+1)
|
||||||
delay := time.Duration(delaySec) * time.Second
|
|
||||||
emit(br, "waiting", map[string]interface{}{
|
emit(br, "waiting", map[string]interface{}{
|
||||||
"url": target,
|
"url": target,
|
||||||
"delay_s": delaySec,
|
"delay_s": delaySec,
|
||||||
"queue": queueLen(db),
|
"queue": queueLen(db),
|
||||||
})
|
})
|
||||||
time.Sleep(delay)
|
time.Sleep(time.Duration(delaySec) * time.Second)
|
||||||
|
|
||||||
|
// ── pause check after sleep (could have been paused during wait) ──
|
||||||
|
checkPause(domain, br)
|
||||||
|
|
||||||
// fetch
|
|
||||||
emit(br, "fetching", map[string]string{"url": target})
|
emit(br, "fetching", map[string]string{"url": target})
|
||||||
resp, err := httpClient.Get(target)
|
resp, err := httpClient.Get(target)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
emit(br, "error", map[string]string{"url": target, "msg": err.Error()})
|
emit(br, "error", map[string]string{"url": target, "msg": err.Error()})
|
||||||
log.Printf("[%s] fetch error %s: %v", domain, target, err)
|
log.Printf("[%s] fetch error %s: %v", domain, target, err)
|
||||||
// Re-enqueue so it's retried next run
|
enqueueURL(db, target) // retry next run
|
||||||
enqueueURL(db, target)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -444,13 +647,12 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
|
|
||||||
var bodyStr string
|
var bodyStr string
|
||||||
if isHTML {
|
if isHTML {
|
||||||
raw, _ := io.ReadAll(io.LimitReader(resp.Body, 5<<20)) // 5 MB cap
|
raw, _ := io.ReadAll(io.LimitReader(resp.Body, 5<<20))
|
||||||
bodyStr = string(raw)
|
bodyStr = string(raw)
|
||||||
}
|
}
|
||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
|
|
||||||
inserted, _ := insertURL(db, target)
|
if ins, _ := insertURL(db, target); ins {
|
||||||
if inserted {
|
|
||||||
emit(br, "saved", map[string]interface{}{
|
emit(br, "saved", map[string]interface{}{
|
||||||
"url": target,
|
"url": target,
|
||||||
"status": resp.StatusCode,
|
"status": resp.StatusCode,
|
||||||
@@ -459,11 +661,12 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
log.Printf("[%s] saved: %s", domain, target)
|
log.Printf("[%s] saved: %s", domain, target)
|
||||||
}
|
}
|
||||||
|
|
||||||
// discover links from HTML pages
|
|
||||||
if isHTML && resp.StatusCode == 200 {
|
if isHTML && resp.StatusCode == 200 {
|
||||||
links := extractLinks(parsed, bodyStr)
|
links := extractLinks(parsed, bodyStr)
|
||||||
|
|
||||||
|
// same-host links → queue
|
||||||
newCount := 0
|
newCount := 0
|
||||||
for _, link := range links {
|
for _, link := range links.sameHost {
|
||||||
if !isURLKnown(db, link) {
|
if !isURLKnown(db, link) {
|
||||||
enqueueURL(db, link)
|
enqueueURL(db, link)
|
||||||
newCount++
|
newCount++
|
||||||
@@ -471,13 +674,20 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
}
|
}
|
||||||
emit(br, "links_found", map[string]interface{}{
|
emit(br, "links_found", map[string]interface{}{
|
||||||
"url": target,
|
"url": target,
|
||||||
"found": len(links),
|
"found": len(links.sameHost),
|
||||||
"new": newCount,
|
"new": newCount,
|
||||||
"queue_len": queueLen(db),
|
"queue_len": queueLen(db),
|
||||||
|
"external": len(links.external),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// external domains → auto-register & crawl
|
||||||
|
for _, extDomain := range links.external {
|
||||||
|
recordExtLink(domain, extDomain, intervalSec)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setDomainStatus(domain, statusDone)
|
||||||
emit(br, "done", map[string]string{"domain": domain, "msg": "crawl complete"})
|
emit(br, "done", map[string]string{"domain": domain, "msg": "crawl complete"})
|
||||||
log.Printf("[%s] crawl complete", domain)
|
log.Printf("[%s] crawl complete", domain)
|
||||||
|
|
||||||
@@ -487,7 +697,7 @@ func crawlDomain(domain string, intervalSec int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// HTTP handlers
|
// HTTP helpers
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func sanitizeDomain(raw string) string {
|
func sanitizeDomain(raw string) string {
|
||||||
@@ -503,37 +713,39 @@ var domainRe = regexp.MustCompile(`^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?
|
|||||||
|
|
||||||
func isValidDomain(d string) bool { return domainRe.MatchString(d) }
|
func isValidDomain(d string) bool { return domainRe.MatchString(d) }
|
||||||
|
|
||||||
|
func jsonOK(w http.ResponseWriter, code int, v interface{}) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.WriteHeader(code)
|
||||||
|
json.NewEncoder(w).Encode(v)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
// Handlers
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
// POST /api/add_domain
|
// POST /api/add_domain
|
||||||
func addDomainHandler(w http.ResponseWriter, r *http.Request) {
|
func addDomainHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
if r.Method != http.MethodPost {
|
if r.Method != http.MethodPost {
|
||||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Domain string `json:"domain"`
|
Domain string `json:"domain"`
|
||||||
CrawlDelay string `json:"Crawl-delay"`
|
CrawlDelay string `json:"Crawl-delay"`
|
||||||
}
|
}
|
||||||
w.Header().Set("Content-Type", "application/json")
|
|
||||||
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||||
w.WriteHeader(http.StatusBadRequest)
|
jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
|
||||||
json.NewEncoder(w).Encode(map[string]string{"error": "invalid JSON"})
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if body.Domain == "" {
|
if body.Domain == "" {
|
||||||
w.WriteHeader(http.StatusBadRequest)
|
jsonOK(w, http.StatusBadRequest, map[string]string{"error": "domain is required"})
|
||||||
json.NewEncoder(w).Encode(map[string]string{"error": "domain is required"})
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
domain := sanitizeDomain(body.Domain)
|
domain := sanitizeDomain(body.Domain)
|
||||||
if !isValidDomain(domain) {
|
if !isValidDomain(domain) {
|
||||||
w.WriteHeader(http.StatusBadRequest)
|
jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
|
||||||
json.NewEncoder(w).Encode(map[string]string{"error": "invalid domain"})
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
interval := 60
|
interval := 60
|
||||||
if body.CrawlDelay != "" {
|
if body.CrawlDelay != "" {
|
||||||
fmt.Sscanf(body.CrawlDelay, "%d", &interval)
|
fmt.Sscanf(body.CrawlDelay, "%d", &interval)
|
||||||
@@ -542,25 +754,15 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now().UTC().Format(time.RFC3339)
|
if err := registerDomain(domain, interval, ""); err != nil {
|
||||||
_, err := mainDB.Exec(
|
jsonOK(w, http.StatusInternalServerError, map[string]string{"error": "db error"})
|
||||||
`INSERT INTO domains (domain,interval,created_at,updated_at) VALUES (?,?,?,?)
|
|
||||||
ON CONFLICT(domain) DO UPDATE SET interval=excluded.interval, updated_at=excluded.updated_at`,
|
|
||||||
domain, interval, now, now,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
w.WriteHeader(http.StatusInternalServerError)
|
|
||||||
json.NewEncoder(w).Encode(map[string]string{"error": "db error"})
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err := openDomainDB(domain); err != nil {
|
if _, err := openDomainDB(domain); err != nil {
|
||||||
w.WriteHeader(http.StatusInternalServerError)
|
jsonOK(w, http.StatusInternalServerError, map[string]string{"error": "domain DB init failed"})
|
||||||
json.NewEncoder(w).Encode(map[string]string{"error": "domain DB init failed"})
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// start crawler if not already running
|
|
||||||
crawlersMu.Lock()
|
crawlersMu.Lock()
|
||||||
if !crawlers[domain] {
|
if !crawlers[domain] {
|
||||||
crawlers[domain] = true
|
crawlers[domain] = true
|
||||||
@@ -568,8 +770,9 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
crawlersMu.Unlock()
|
crawlersMu.Unlock()
|
||||||
|
|
||||||
w.WriteHeader(http.StatusCreated)
|
broadcast("new_domain", map[string]string{"domain": domain, "parent": ""})
|
||||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
|
||||||
|
jsonOK(w, http.StatusCreated, map[string]interface{}{
|
||||||
"message": "domain added, crawler started",
|
"message": "domain added, crawler started",
|
||||||
"domain": domain,
|
"domain": domain,
|
||||||
"interval": interval,
|
"interval": interval,
|
||||||
@@ -578,35 +781,98 @@ func addDomainHandler(w http.ResponseWriter, r *http.Request) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// GET /api/sse/{domain}
|
// GET /api/domains
|
||||||
func sseHandler(w http.ResponseWriter, r *http.Request) {
|
func domainsHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
rawDomain := strings.TrimPrefix(r.URL.Path, "/api/sse/")
|
if r.Method != http.MethodGet {
|
||||||
domain := sanitizeDomain(rawDomain)
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
if !isValidDomain(domain) {
|
|
||||||
http.Error(w, "invalid domain", http.StatusBadRequest)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
list, err := listDomains()
|
||||||
|
if err != nil {
|
||||||
|
jsonOK(w, http.StatusInternalServerError, map[string]string{"error": err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if list == nil {
|
||||||
|
list = []DomainRow{}
|
||||||
|
}
|
||||||
|
jsonOK(w, http.StatusOK, list)
|
||||||
|
}
|
||||||
|
|
||||||
|
// POST /api/pause/{domain}
|
||||||
|
func pauseHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.Method != http.MethodPost {
|
||||||
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
domain := sanitizeDomain(strings.TrimPrefix(r.URL.Path, "/api/pause/"))
|
||||||
|
if !isValidDomain(domain) {
|
||||||
|
jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
crawlersMu.Lock()
|
||||||
|
running := crawlers[domain]
|
||||||
|
crawlersMu.Unlock()
|
||||||
|
|
||||||
|
if !running {
|
||||||
|
jsonOK(w, http.StatusConflict, map[string]string{"error": "crawler not running for this domain"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
pauseCrawler(domain)
|
||||||
|
jsonOK(w, http.StatusOK, map[string]string{"message": "pause signal sent", "domain": domain})
|
||||||
|
}
|
||||||
|
|
||||||
|
// POST /api/resume/{domain}
|
||||||
|
func resumeHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.Method != http.MethodPost {
|
||||||
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
domain := sanitizeDomain(strings.TrimPrefix(r.URL.Path, "/api/resume/"))
|
||||||
|
if !isValidDomain(domain) {
|
||||||
|
jsonOK(w, http.StatusBadRequest, map[string]string{"error": "invalid domain"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resumeCrawler(domain)
|
||||||
|
jsonOK(w, http.StatusOK, map[string]string{"message": "resume signal sent", "domain": domain})
|
||||||
|
}
|
||||||
|
|
||||||
|
// GET /api/sse/{domain} — or /api/sse/ (global stream for all domains)
|
||||||
|
func sseHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
|
rawDomain := strings.TrimPrefix(r.URL.Path, "/api/sse/")
|
||||||
|
rawDomain = strings.TrimRight(rawDomain, "/")
|
||||||
|
|
||||||
flusher, ok := w.(http.Flusher)
|
flusher, ok := w.(http.Flusher)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "streaming not supported", http.StatusInternalServerError)
|
http.Error(w, "streaming not supported", http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "text/event-stream")
|
w.Header().Set("Content-Type", "text/event-stream")
|
||||||
w.Header().Set("Cache-Control", "no-cache")
|
w.Header().Set("Cache-Control", "no-cache")
|
||||||
w.Header().Set("Connection", "keep-alive")
|
w.Header().Set("Connection", "keep-alive")
|
||||||
w.Header().Set("X-Accel-Buffering", "no") // nginx: disable proxy buffering
|
w.Header().Set("X-Accel-Buffering", "no")
|
||||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||||
|
|
||||||
br := getBroker(domain)
|
// If no domain specified, subscribe to the global "__all__" broker
|
||||||
|
// which receives broadcasts (new_domain, shutdown, etc.)
|
||||||
|
domainKey := rawDomain
|
||||||
|
if domainKey == "" {
|
||||||
|
domainKey = "__all__"
|
||||||
|
} else {
|
||||||
|
domainKey = sanitizeDomain(domainKey)
|
||||||
|
if !isValidDomain(domainKey) && domainKey != "__all__" {
|
||||||
|
http.Error(w, "invalid domain", http.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
br := getBroker(domainKey)
|
||||||
ch := br.subscribe()
|
ch := br.subscribe()
|
||||||
defer br.unsubscribe(ch)
|
defer br.unsubscribe(ch)
|
||||||
|
|
||||||
log.Printf("[SSE] client connected → %s", domain)
|
log.Printf("[SSE] client connected → %s", domainKey)
|
||||||
|
fmt.Fprintf(w, "data: {\"event\":\"connected\",\"data\":{\"domain\":%q}}\n\n", domainKey)
|
||||||
// send immediate connected event
|
|
||||||
fmt.Fprintf(w, "data: {\"event\":\"connected\",\"data\":{\"domain\":%q}}\n\n", domain)
|
|
||||||
flusher.Flush()
|
flusher.Flush()
|
||||||
|
|
||||||
ticker := time.NewTicker(25 * time.Second)
|
ticker := time.NewTicker(25 * time.Second)
|
||||||
@@ -615,7 +881,7 @@ func sseHandler(w http.ResponseWriter, r *http.Request) {
|
|||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-r.Context().Done():
|
case <-r.Context().Done():
|
||||||
log.Printf("[SSE] client disconnected → %s", domain)
|
log.Printf("[SSE] client disconnected → %s", domainKey)
|
||||||
return
|
return
|
||||||
case msg := <-ch:
|
case msg := <-ch:
|
||||||
fmt.Fprintf(w, "data: %s\n\n", msg)
|
fmt.Fprintf(w, "data: %s\n\n", msg)
|
||||||
@@ -627,6 +893,27 @@ func sseHandler(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
// broadcast helper – publish to __all__ broker
|
||||||
|
// ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// ensure the global broadcast broker always exists
|
||||||
|
getBroker("__all__")
|
||||||
|
}
|
||||||
|
|
||||||
|
// override broadcast to also send to __all__
|
||||||
|
func broadcastAll(event string, data interface{}) {
|
||||||
|
b, _ := json.Marshal(ssePayload{Event: event, Data: data})
|
||||||
|
msg := string(b)
|
||||||
|
|
||||||
|
brokersMu.RLock()
|
||||||
|
defer brokersMu.RUnlock()
|
||||||
|
for _, br := range brokers {
|
||||||
|
br.publish(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
// main
|
// main
|
||||||
// ─────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────
|
||||||
@@ -640,20 +927,28 @@ func main() {
|
|||||||
|
|
||||||
initMainDB()
|
initMainDB()
|
||||||
|
|
||||||
// Resume any domains already in the DB from a previous run
|
// Resume domains from previous run
|
||||||
rows, err := mainDB.Query(`SELECT domain, interval FROM domains`)
|
rows, err := mainDB.Query(`SELECT domain, interval, status FROM domains`)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var d string
|
var d, status string
|
||||||
var iv int
|
var iv int
|
||||||
if rows.Scan(&d, &iv) == nil {
|
if rows.Scan(&d, &iv, &status) != nil {
|
||||||
crawlersMu.Lock()
|
continue
|
||||||
if !crawlers[d] {
|
|
||||||
crawlers[d] = true
|
|
||||||
go crawlDomain(d, iv)
|
|
||||||
}
|
|
||||||
crawlersMu.Unlock()
|
|
||||||
}
|
}
|
||||||
|
// don't restart completed or paused crawls automatically;
|
||||||
|
// only restart those that were mid-flight (running/pending)
|
||||||
|
if status == statusDone {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
crawlersMu.Lock()
|
||||||
|
if !crawlers[d] {
|
||||||
|
crawlers[d] = true
|
||||||
|
// reset status so it shows running
|
||||||
|
setDomainStatus(d, statusPending)
|
||||||
|
go crawlDomain(d, iv)
|
||||||
|
}
|
||||||
|
crawlersMu.Unlock()
|
||||||
}
|
}
|
||||||
rows.Close()
|
rows.Close()
|
||||||
}
|
}
|
||||||
@@ -661,61 +956,46 @@ func main() {
|
|||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
mux.Handle("/", http.FileServer(http.Dir("./static")))
|
mux.Handle("/", http.FileServer(http.Dir("./static")))
|
||||||
mux.HandleFunc("/api/add_domain", addDomainHandler)
|
mux.HandleFunc("/api/add_domain", addDomainHandler)
|
||||||
|
mux.HandleFunc("/api/domains", domainsHandler)
|
||||||
|
mux.HandleFunc("/api/pause/", pauseHandler)
|
||||||
|
mux.HandleFunc("/api/resume/", resumeHandler)
|
||||||
mux.HandleFunc("/api/sse/", sseHandler)
|
mux.HandleFunc("/api/sse/", sseHandler)
|
||||||
|
|
||||||
srv := &http.Server{
|
srv := &http.Server{Addr: ":8080", Handler: mux}
|
||||||
Addr: ":8080",
|
|
||||||
Handler: mux,
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Graceful shutdown ────────────────────────────────────────
|
|
||||||
quit := make(chan os.Signal, 1)
|
quit := make(chan os.Signal, 1)
|
||||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
log.Printf("siliconpin_spider listening on %s", srv.Addr)
|
log.Printf("siliconpin_spider listening on :8080")
|
||||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||||
log.Fatalf("server error: %v", err)
|
log.Fatalf("server: %v", err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
sig := <-quit
|
<-quit
|
||||||
log.Printf("received %s — shutting down gracefully…", sig)
|
log.Println("shutting down…")
|
||||||
|
|
||||||
// 1. Stop accepting new HTTP requests; give in-flight ones 10s to finish
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
if err := srv.Shutdown(ctx); err != nil {
|
srv.Shutdown(ctx) //nolint:errcheck
|
||||||
log.Printf("HTTP shutdown error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Notify all SSE clients
|
|
||||||
brokersMu.RLock()
|
brokersMu.RLock()
|
||||||
for domain, br := range brokers {
|
for d, br := range brokers {
|
||||||
emit(br, "shutdown", map[string]string{"domain": domain, "msg": "server stopping"})
|
emit(br, "shutdown", map[string]string{"domain": d, "msg": "server stopping"})
|
||||||
}
|
}
|
||||||
brokersMu.RUnlock()
|
brokersMu.RUnlock()
|
||||||
|
|
||||||
// Brief pause so SSE messages flush to clients
|
|
||||||
time.Sleep(500 * time.Millisecond)
|
time.Sleep(500 * time.Millisecond)
|
||||||
|
|
||||||
// 3. Checkpoint WAL → merge pending writes into the .sqlite file
|
|
||||||
// After this the .sqlite is fully self-contained (no WAL needed).
|
|
||||||
domainDBsMu.RLock()
|
domainDBsMu.RLock()
|
||||||
for domain, db := range domainDBs {
|
for d, db := range domainDBs {
|
||||||
if _, err := db.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`); err != nil {
|
db.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`) //nolint:errcheck
|
||||||
log.Printf("checkpoint %s: %v", domain, err)
|
|
||||||
} else {
|
|
||||||
log.Printf("checkpointed %s.sqlite", domain)
|
|
||||||
}
|
|
||||||
db.Close()
|
db.Close()
|
||||||
|
log.Printf("checkpointed %s.sqlite", d)
|
||||||
}
|
}
|
||||||
domainDBsMu.RUnlock()
|
domainDBsMu.RUnlock()
|
||||||
|
|
||||||
if _, err := mainDB.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`); err != nil {
|
mainDB.Exec(`PRAGMA wal_checkpoint(TRUNCATE)`) //nolint:errcheck
|
||||||
log.Printf("checkpoint main DB: %v", err)
|
|
||||||
}
|
|
||||||
mainDB.Close()
|
mainDB.Close()
|
||||||
|
log.Println("goodbye.")
|
||||||
log.Println("shutdown complete — all WAL data flushed, goodbye.")
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,134 +5,379 @@
|
|||||||
<meta name="viewport" content="width=device-width,initial-scale=1"/>
|
<meta name="viewport" content="width=device-width,initial-scale=1"/>
|
||||||
<title>SiliconPin Spider</title>
|
<title>SiliconPin Spider</title>
|
||||||
<style>
|
<style>
|
||||||
*{box-sizing:border-box;margin:0;padding:0}
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
body{font-family:'Segoe UI',sans-serif;background:#0f1117;color:#e0e0e0;min-height:100vh;padding:32px 20px}
|
body{font-family:'Segoe UI',system-ui,sans-serif;background:#0d1117;color:#c9d1d9;min-height:100vh}
|
||||||
h1{color:#58a6ff;font-size:1.8rem;margin-bottom:4px}
|
a{color:#58a6ff;text-decoration:none}
|
||||||
.sub{color:#8b949e;font-size:.9rem;margin-bottom:32px}
|
|
||||||
.card{background:#161b22;border:1px solid #30363d;border-radius:10px;padding:24px;max-width:680px;margin-bottom:24px}
|
/* ── layout ─────────────────────────── */
|
||||||
h2{font-size:1rem;color:#cdd9e5;margin-bottom:16px}
|
.layout{display:grid;grid-template-columns:320px 1fr;height:100vh;overflow:hidden}
|
||||||
label{display:block;font-size:.82rem;color:#8b949e;margin-bottom:4px}
|
.sidebar{background:#161b22;border-right:1px solid #30363d;display:flex;flex-direction:column;overflow:hidden}
|
||||||
input{width:100%;padding:8px 12px;background:#0d1117;border:1px solid #30363d;border-radius:6px;color:#e0e0e0;font-size:.92rem;outline:none;transition:border .2s}
|
.main{display:flex;flex-direction:column;overflow:hidden}
|
||||||
input:focus{border-color:#58a6ff}
|
|
||||||
.row{display:flex;gap:12px;margin-bottom:14px}
|
/* ── sidebar header ─────────────────── */
|
||||||
.row>div{flex:1}
|
.sidebar-header{padding:16px;border-bottom:1px solid #30363d;flex-shrink:0}
|
||||||
button{padding:9px 22px;background:#238636;border:none;border-radius:6px;color:#fff;font-size:.9rem;cursor:pointer;transition:background .2s}
|
.sidebar-header h1{font-size:1.1rem;color:#58a6ff;display:flex;align-items:center;gap:6px}
|
||||||
button:hover{background:#2ea043}
|
.sidebar-header p{font-size:.75rem;color:#8b949e;margin-top:3px}
|
||||||
#log{background:#0d1117;border:1px solid #30363d;border-radius:8px;padding:16px;height:340px;overflow-y:auto;font-size:.8rem;font-family:monospace;margin-top:12px}
|
|
||||||
.ev{padding:3px 0;border-bottom:1px solid #1c2128;display:flex;gap:8px;align-items:flex-start}
|
/* ── add domain form ────────────────── */
|
||||||
.ev:last-child{border-bottom:none}
|
.add-form{padding:12px 16px;border-bottom:1px solid #30363d;flex-shrink:0}
|
||||||
.badge{font-size:.7rem;padding:2px 7px;border-radius:12px;white-space:nowrap;font-weight:600}
|
.add-form .row{display:flex;gap:6px;margin-bottom:8px}
|
||||||
.connected{background:#1f4e79;color:#79c0ff}
|
.add-form input{flex:1;padding:6px 10px;background:#0d1117;border:1px solid #30363d;border-radius:6px;color:#c9d1d9;font-size:.82rem;outline:none}
|
||||||
.status {background:#2d333b;color:#cdd9e5}
|
.add-form input:focus{border-color:#58a6ff}
|
||||||
.robots {background:#3b2300;color:#f0883e}
|
.add-form input.narrow{max-width:80px;flex:none}
|
||||||
.waiting {background:#1c2a1e;color:#56d364}
|
.btn{padding:6px 14px;border:none;border-radius:6px;cursor:pointer;font-size:.82rem;font-weight:600;transition:opacity .15s}
|
||||||
.fetching {background:#172033;color:#79c0ff}
|
.btn:hover{opacity:.85}
|
||||||
.saved {background:#0d2818;color:#56d364}
|
.btn-green{background:#238636;color:#fff}
|
||||||
.links_found{background:#1f2d3d;color:#a5d6ff}
|
.btn-gray{background:#30363d;color:#c9d1d9}
|
||||||
.skipped {background:#2d2d00;color:#e3b341}
|
.btn-yellow{background:#9e6a03;color:#fff}
|
||||||
.error {background:#3d0000;color:#f85149}
|
.btn-blue{background:#1f6feb;color:#fff}
|
||||||
.done {background:#1f4e2c;color:#56d364}
|
.add-result{font-size:.75rem;color:#8b949e;min-height:16px}
|
||||||
.keepalive{background:#2d333b;color:#484f58;font-style:italic}
|
|
||||||
.ev-body{word-break:break-all;color:#cdd9e5}
|
/* ── domain list ────────────────────── */
|
||||||
.status-dot{width:8px;height:8px;border-radius:50%;background:#484f58;display:inline-block;margin-right:6px;flex-shrink:0;margin-top:4px}
|
.domain-list{flex:1;overflow-y:auto;padding:8px 0}
|
||||||
.status-dot.live{background:#56d364;animation:pulse 1.5s infinite}
|
.domain-card{padding:10px 16px;cursor:pointer;border-left:3px solid transparent;transition:background .12s}
|
||||||
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}}
|
.domain-card:hover{background:#1c2128}
|
||||||
.conn-row{display:flex;align-items:center;gap:8px;margin-bottom:8px}
|
.domain-card.active{background:#1c2128;border-left-color:#58a6ff}
|
||||||
|
.domain-card .dc-name{font-size:.88rem;font-weight:600;color:#e6edf3;display:flex;align-items:center;gap:6px}
|
||||||
|
.domain-card .dc-meta{font-size:.72rem;color:#8b949e;margin-top:3px;display:flex;gap:10px}
|
||||||
|
.domain-card .dc-parent{font-size:.7rem;color:#6e7681;margin-top:2px}
|
||||||
|
|
||||||
|
/* ── status badge ───────────────────── */
|
||||||
|
.badge{font-size:.65rem;padding:1px 7px;border-radius:10px;font-weight:700;white-space:nowrap}
|
||||||
|
.b-running {background:#0d4429;color:#3fb950}
|
||||||
|
.b-paused {background:#3d2e00;color:#d29922}
|
||||||
|
.b-done {background:#0d2040;color:#58a6ff}
|
||||||
|
.b-pending {background:#282d33;color:#8b949e}
|
||||||
|
|
||||||
|
/* ── main area ──────────────────────── */
|
||||||
|
.main-header{padding:12px 20px;border-bottom:1px solid #30363d;display:flex;align-items:center;gap:12px;flex-shrink:0;background:#161b22}
|
||||||
|
.main-header h2{font-size:1rem;color:#e6edf3;flex:1}
|
||||||
|
.conn-dot{width:9px;height:9px;border-radius:50%;background:#3fb950;flex-shrink:0}
|
||||||
|
.conn-dot.off{background:#484f58}
|
||||||
|
.conn-dot.pulse{animation:pulse 1.5s infinite}
|
||||||
|
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.3}}
|
||||||
|
|
||||||
|
.log-wrap{flex:1;overflow-y:auto;padding:10px 16px;font-family:'Cascadia Code','Fira Code',monospace;font-size:.75rem}
|
||||||
|
.ev{display:flex;gap:8px;padding:3px 0;border-bottom:1px solid #161b22;align-items:flex-start}
|
||||||
|
.ev-badge{font-size:.67rem;padding:1px 7px;border-radius:10px;font-weight:700;white-space:nowrap;flex-shrink:0;margin-top:1px}
|
||||||
|
.ev-body{word-break:break-all;color:#8b949e}
|
||||||
|
.ev-body b{color:#c9d1d9}
|
||||||
|
|
||||||
|
/* event colours */
|
||||||
|
.e-connected {background:#0d2040;color:#58a6ff}
|
||||||
|
.e-status {background:#1c2128;color:#8b949e}
|
||||||
|
.e-robots {background:#3d2200;color:#d29922}
|
||||||
|
.e-waiting {background:#0d2e1a;color:#3fb950}
|
||||||
|
.e-fetching {background:#0d2040;color:#79c0ff}
|
||||||
|
.e-saved {background:#0d2e1a;color:#3fb950}
|
||||||
|
.e-links_found {background:#112040;color:#a5d6ff}
|
||||||
|
.e-skipped {background:#2d2600;color:#d29922}
|
||||||
|
.e-error {background:#3d0000;color:#f85149}
|
||||||
|
.e-done {background:#0d2040;color:#79c0ff}
|
||||||
|
.e-paused {background:#3d2e00;color:#d29922}
|
||||||
|
.e-resumed {background:#0d2e1a;color:#3fb950}
|
||||||
|
.e-new_domain {background:#1f1640;color:#d2a8ff}
|
||||||
|
.e-shutdown {background:#2d0000;color:#f85149}
|
||||||
|
|
||||||
|
.empty{color:#484f58;padding:20px;text-align:center;font-size:.82rem}
|
||||||
|
.stats-bar{display:flex;gap:16px;padding:6px 20px;background:#0d1117;border-bottom:1px solid #21262d;font-size:.73rem;color:#8b949e;flex-shrink:0}
|
||||||
|
.stats-bar span b{color:#c9d1d9}
|
||||||
|
|
||||||
|
/* ── no-domain placeholder ──────────── */
|
||||||
|
.placeholder{flex:1;display:flex;align-items:center;justify-content:center;flex-direction:column;gap:10px;color:#484f58}
|
||||||
|
.placeholder svg{opacity:.3}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<h1>🕷 SiliconPin Spider</h1>
|
<div class="layout">
|
||||||
<p class="sub">Polite web crawler — respects robots.txt · random delay · SSE live feed</p>
|
|
||||||
|
|
||||||
<div class="card">
|
<!-- ═══════════════ SIDEBAR ═══════════════ -->
|
||||||
<h2>Add domain</h2>
|
<aside class="sidebar">
|
||||||
<div class="row">
|
<div class="sidebar-header">
|
||||||
<div>
|
<h1>🕷 SiliconPin Spider</h1>
|
||||||
<label>Domain</label>
|
<p>Polite web crawler · robots.txt · SSE live</p>
|
||||||
<input id="domain" placeholder="siliconpin.com" value=""/>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<label>Crawl-delay (s)</label>
|
|
||||||
<input id="delay" placeholder="20" value="20" style="max-width:100px"/>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
<button onclick="addDomain()">Add & Crawl</button>
|
|
||||||
<div id="addResult" style="margin-top:10px;font-size:.82rem;color:#8b949e"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="card">
|
<div class="add-form">
|
||||||
<h2>Live SSE stream</h2>
|
<div class="row">
|
||||||
<div class="conn-row">
|
<input id="inp-domain" placeholder="domain.com" autocomplete="off"/>
|
||||||
<span class="status-dot" id="dot"></span>
|
<input id="inp-delay" placeholder="delay s" class="narrow" value="20"/>
|
||||||
<input id="sseDomain" placeholder="siliconpin.com" style="flex:1"/>
|
</div>
|
||||||
<button onclick="connectSSE()">Connect</button>
|
<div class="row">
|
||||||
<button onclick="clearLog()" style="background:#30363d">Clear</button>
|
<button class="btn btn-green" onclick="addDomain()">+ Add & Crawl</button>
|
||||||
|
<button class="btn btn-gray" onclick="refreshList()">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="add-result" id="add-result"></div>
|
||||||
</div>
|
</div>
|
||||||
<div id="log"><span style="color:#484f58">— events will appear here —</span></div>
|
|
||||||
|
<div class="domain-list" id="domain-list">
|
||||||
|
<div class="empty">No domains yet</div>
|
||||||
|
</div>
|
||||||
|
</aside>
|
||||||
|
|
||||||
|
<!-- ═══════════════ MAIN ═══════════════ -->
|
||||||
|
<main class="main">
|
||||||
|
<div id="main-placeholder" class="placeholder" style="flex:1;display:flex">
|
||||||
|
<svg width="64" height="64" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1">
|
||||||
|
<circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="12"/>
|
||||||
|
<line x1="12" y1="16" x2="12.01" y2="16"/>
|
||||||
|
</svg>
|
||||||
|
<p>Select a domain to watch its live feed</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="main-panel" style="display:none;flex-direction:column;flex:1;overflow:hidden">
|
||||||
|
<div class="main-header">
|
||||||
|
<span class="conn-dot off" id="conn-dot"></span>
|
||||||
|
<h2 id="panel-title">—</h2>
|
||||||
|
<button class="btn btn-yellow" id="btn-pause" onclick="pauseDomain()">⏸ Pause</button>
|
||||||
|
<button class="btn btn-blue" id="btn-resume" onclick="resumeDomain()" style="display:none">▶ Resume</button>
|
||||||
|
<button class="btn btn-gray" onclick="clearLog()">✕ Clear</button>
|
||||||
|
</div>
|
||||||
|
<div class="stats-bar" id="stats-bar">
|
||||||
|
<span>URLs: <b id="stat-urls">—</b></span>
|
||||||
|
<span>Queue: <b id="stat-queue">—</b></span>
|
||||||
|
<span>Status: <b id="stat-status">—</b></span>
|
||||||
|
<span>Interval: <b id="stat-interval">—</b>s</span>
|
||||||
|
</div>
|
||||||
|
<div class="log-wrap" id="log"></div>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
let es = null;
|
// ── state ──────────────────────────────────────────────────
|
||||||
|
let activeDomain = null;
|
||||||
|
let activeSSE = null;
|
||||||
|
let globalSSE = null;
|
||||||
|
let domainMap = {}; // domain → row data
|
||||||
|
|
||||||
|
// ── init ───────────────────────────────────────────────────
|
||||||
|
window.addEventListener('DOMContentLoaded', () => {
|
||||||
|
refreshList();
|
||||||
|
connectGlobalSSE();
|
||||||
|
setInterval(refreshList, 15000);
|
||||||
|
});
|
||||||
|
|
||||||
|
// ── global SSE (new_domain, shutdown) ──────────────────────
|
||||||
|
function connectGlobalSSE() {
|
||||||
|
if (globalSSE) globalSSE.close();
|
||||||
|
globalSSE = new EventSource('/api/sse/');
|
||||||
|
globalSSE.onmessage = e => {
|
||||||
|
try {
|
||||||
|
const obj = JSON.parse(e.data);
|
||||||
|
if (obj.event === 'new_domain') {
|
||||||
|
refreshList();
|
||||||
|
// auto-open if it was discovered from the active domain
|
||||||
|
if (activeDomain && obj.data.parent === activeDomain) {
|
||||||
|
appendLog({event:'new_domain', data:obj.data});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(_) {}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── fetch domain list ───────────────────────────────────────
|
||||||
|
async function refreshList() {
|
||||||
|
const res = await fetch('/api/domains');
|
||||||
|
if (!res.ok) return;
|
||||||
|
const list = await res.json();
|
||||||
|
domainMap = {};
|
||||||
|
list.forEach(d => domainMap[d.domain] = d);
|
||||||
|
renderList(list);
|
||||||
|
if (activeDomain && domainMap[activeDomain]) updateStats(domainMap[activeDomain]);
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderList(list) {
|
||||||
|
const el = document.getElementById('domain-list');
|
||||||
|
if (!list || list.length === 0) {
|
||||||
|
el.innerHTML = '<div class="empty">No domains yet</div>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
el.innerHTML = list.map(d => `
|
||||||
|
<div class="domain-card${d.domain === activeDomain ? ' active' : ''}"
|
||||||
|
id="dc-${esc(d.domain)}" onclick="selectDomain('${esc(d.domain)}')">
|
||||||
|
<div class="dc-name">
|
||||||
|
<span>${esc(d.domain)}</span>
|
||||||
|
<span class="badge b-${d.status}">${d.status}</span>
|
||||||
|
</div>
|
||||||
|
<div class="dc-meta">
|
||||||
|
<span>✓ ${d.url_count} urls</span>
|
||||||
|
<span>⏳ ${d.queue_len} queued</span>
|
||||||
|
<span>⏱ ${d.interval}s</span>
|
||||||
|
</div>
|
||||||
|
${d.parent ? `<div class="dc-parent">↳ from ${esc(d.parent)}</div>` : ''}
|
||||||
|
</div>`).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── select domain → connect SSE ────────────────────────────
|
||||||
|
function selectDomain(domain) {
|
||||||
|
if (activeDomain === domain) return;
|
||||||
|
activeDomain = domain;
|
||||||
|
|
||||||
|
// highlight sidebar
|
||||||
|
document.querySelectorAll('.domain-card').forEach(c => c.classList.remove('active'));
|
||||||
|
const card = document.getElementById('dc-' + domain);
|
||||||
|
if (card) card.classList.add('active');
|
||||||
|
|
||||||
|
// show panel
|
||||||
|
document.getElementById('main-placeholder').style.display = 'none';
|
||||||
|
const panel = document.getElementById('main-panel');
|
||||||
|
panel.style.display = 'flex';
|
||||||
|
document.getElementById('panel-title').textContent = domain;
|
||||||
|
clearLog();
|
||||||
|
|
||||||
|
// update stats
|
||||||
|
if (domainMap[domain]) updateStats(domainMap[domain]);
|
||||||
|
|
||||||
|
// SSE
|
||||||
|
if (activeSSE) activeSSE.close();
|
||||||
|
setDot(true);
|
||||||
|
activeSSE = new EventSource('/api/sse/' + domain);
|
||||||
|
activeSSE.onmessage = e => {
|
||||||
|
try { appendLog(JSON.parse(e.data)); } catch(_) {}
|
||||||
|
};
|
||||||
|
activeSSE.onerror = () => setDot(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
function updateStats(d) {
|
||||||
|
document.getElementById('stat-urls').textContent = d.url_count;
|
||||||
|
document.getElementById('stat-queue').textContent = d.queue_len;
|
||||||
|
document.getElementById('stat-status').textContent = d.status;
|
||||||
|
document.getElementById('stat-interval').textContent = d.interval;
|
||||||
|
|
||||||
|
const paused = d.status === 'paused';
|
||||||
|
document.getElementById('btn-pause').style.display = paused ? 'none' : '';
|
||||||
|
document.getElementById('btn-resume').style.display = paused ? '' : 'none';
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── log rendering ───────────────────────────────────────────
|
||||||
|
function appendLog(obj) {
|
||||||
|
const event = obj.event || 'status';
|
||||||
|
const data = obj.data || {};
|
||||||
|
|
||||||
|
// update stats inline from events
|
||||||
|
if (event === 'saved' || event === 'links_found' || event === 'waiting') {
|
||||||
|
if (domainMap[activeDomain] && event === 'saved') {
|
||||||
|
domainMap[activeDomain].url_count++;
|
||||||
|
document.getElementById('stat-urls').textContent = domainMap[activeDomain].url_count;
|
||||||
|
}
|
||||||
|
if (data.queue_len !== undefined && domainMap[activeDomain]) {
|
||||||
|
domainMap[activeDomain].queue_len = data.queue_len ?? data.queue ?? 0;
|
||||||
|
document.getElementById('stat-queue').textContent = domainMap[activeDomain].queue_len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (event === 'paused' || event === 'resumed' || event === 'done') {
|
||||||
|
refreshList();
|
||||||
|
const st = event === 'paused' ? 'paused' : event === 'done' ? 'done' : 'running';
|
||||||
|
document.getElementById('stat-status').textContent = st;
|
||||||
|
document.getElementById('btn-pause').style.display = (event === 'paused') ? 'none' : '';
|
||||||
|
document.getElementById('btn-resume').style.display = (event === 'paused') ? '' : 'none';
|
||||||
|
}
|
||||||
|
if (event === 'new_domain') refreshList();
|
||||||
|
|
||||||
|
const body = formatBody(event, data);
|
||||||
|
const log = document.getElementById('log');
|
||||||
|
if (log.querySelector('.empty')) log.innerHTML = '';
|
||||||
|
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.className = 'ev';
|
||||||
|
div.innerHTML = `<span class="ev-badge e-${event}">${event}</span><span class="ev-body">${body}</span>`;
|
||||||
|
log.appendChild(div);
|
||||||
|
log.scrollTop = log.scrollHeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Produce the HTML body snippet for one log entry.
// String payloads are escaped verbatim; object payloads are rendered
// per event type, with all server-supplied text routed through esc().
function formatBody(event, data) {
  if (typeof data === 'string') return esc(data);
  switch (event) {
    case 'waiting':
      return `<b>${esc(data.url||'')}</b> — delay <b>${data.delay_s}s</b>, queue <b>${data.queue}</b>`;
    case 'fetching':
      return `<b>${esc(data.url||'')}</b>`;
    case 'saved':
      return `<b>${esc(data.url||'')}</b> <span style="color:#6e7681">[${data.status} ${esc(data.content_type||'')}]</span>`;
    case 'links_found':
      return `<b>${esc(data.url||'')}</b> — found <b>${data.found}</b> same-host, <b>${data.new}</b> new, <b>${data.external||0}</b> external, queue <b>${data.queue_len}</b>`;
    case 'skipped':
      return `<b>${esc(data.url||'')}</b> — ${esc(data.reason||'')}`;
    case 'error':
      // URL-less errors show only the message; otherwise "url — message".
      return `<b>${esc(data.url||data.msg||'')}</b>${data.url ? ' — '+esc(data.msg||'') : ''}`;
    case 'robots':
      return `delay <b>${data.robots_delay}s</b> → effective <b>${data.effective_delay}s</b>, disallowed: <b>${(data.disallowed||[]).length}</b>`;
    case 'new_domain':
      return `discovered <b>${esc(data.domain||'')}</b>${data.parent ? ` from <b>${esc(data.parent)}</b>` : ''}`;
    case 'done':
      return `<b>${esc(data.domain||'')}</b> — crawl complete ✓`;
    case 'paused':
      return `crawler paused — send <b>resume</b> to continue`;
    case 'resumed':
      return `crawler resumed`;
    case 'connected':
      return `stream connected for <b>${esc(data.domain||'')}</b>`;
    default:
      // Unknown events: dump the payload as escaped JSON/text.
      return esc(typeof data === 'object' ? JSON.stringify(data) : String(data));
  }
}
|
||||||
|
|
||||||
|
// ── add domain ──────────────────────────────────────────────
|
||||||
// Submit the add-domain form: POST the domain (and optional crawl delay)
// to the server, report the outcome via showResult, then refresh and
// select the new domain. On success the input is cleared.
async function addDomain() {
  const domain = document.getElementById('inp-domain').value.trim();
  const delay = document.getElementById('inp-delay').value.trim();
  if (!domain) { showResult('Domain is required', true); return; }

  try {
    const res = await fetch('/api/add_domain', {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      // the server reads the robots.txt-style "Crawl-delay" key verbatim
      body: JSON.stringify({domain, 'Crawl-delay': delay})
    });
    const data = await res.json();
    if (res.ok) {
      showResult(`✓ ${data.message}`, false);
      document.getElementById('inp-domain').value = '';
      await refreshList();
      selectDomain(data.domain);
    } else {
      showResult(`✗ ${data.error}`, true);
    }
  } catch (e) {
    // Network failure or a non-JSON reply would otherwise become an
    // unhandled promise rejection with no UI feedback — surface it.
    showResult(`✗ ${e.message || e}`, true);
  }
}
|
||||||
|
|
||||||
// Show a status line under the add-domain form.
// err=true renders in red, otherwise in green.
function showResult(msg, err) {
  const el = document.getElementById('add-result');
  el.textContent = msg;
  if (err) {
    el.style.color = '#f85149';
  } else {
    el.style.color = '#3fb950';
  }
}
|
||||||
|
|
||||||
// ── pause / resume ──────────────────────────────────────────

// Ask the server to pause the crawler for the currently selected domain.
// The resulting "paused" SSE event updates the UI; no local state changes here.
async function pauseDomain() {
  if (!activeDomain) return; // nothing selected yet
  await fetch(`/api/pause/${activeDomain}`, {method:'POST'});
}
|
||||||
|
|
||||||
// Ask the server to resume the crawler for the currently selected domain.
// The resulting "resumed" SSE event updates the UI; no local state changes here.
async function resumeDomain() {
  if (!activeDomain) return; // nothing selected yet
  await fetch(`/api/resume/${activeDomain}`, {method:'POST'});
}
|
||||||
|
|
||||||
|
// ── utils ───────────────────────────────────────────────────
|
||||||
// Reset the log pane back to its empty-state placeholder.
function clearLog() {
  const log = document.getElementById('log');
  log.innerHTML = '<div class="empty">— waiting for events —</div>';
}
|
||||||
|
|
||||||
// Toggle the SSE connection indicator: pulsing when live, dimmed when off.
function setDot(live) {
  const cls = live ? ' pulse' : ' off';
  document.getElementById('conn-dot').className = 'conn-dot' + cls;
}
|
||||||
|
|
||||||
|
// Escape a value for safe interpolation into innerHTML.
// Coerces to string, then replaces &, <, > and " with their HTML entities.
// NOTE: the replacement strings here were mangled to identity no-ops by
// HTML-entity decoding during export (e.g. replace(/&/g,'&')), which would
// silence escaping entirely — restored to the proper entities.
function esc(s) {
  return String(s)
    .replace(/&/g, '&amp;').replace(/</g, '&lt;')
    .replace(/>/g, '&gt;').replace(/"/g, '&quot;');
}
|
||||||
|
|
||||||
|
// Keyboard shortcut: pressing Enter while the domain input is focused
// submits the add-domain form.
document.addEventListener('keydown', (e) => {
  if (e.key !== 'Enter') return;
  if (document.activeElement.id !== 'inp-domain') return;
  addDomain();
});
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
Reference in New Issue
Block a user