// Package crawler implements a concurrent, depth-limited web crawler
// that records discovered pages for sitemap generation.
package crawler
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"net/url"
|
|
"sitemap-api/database"
|
|
"sitemap-api/models"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Crawler performs a depth-limited, same-host crawl of a website,
// persisting discovered pages and streaming progress events.
// One Crawler holds the mutable state of a single crawl (visited set,
// counters), so an instance must not run two crawls concurrently.
// Contains a sync.Mutex — do not copy; always use *Crawler.
type Crawler struct {
	db *database.DB // persistence layer for sites and pages
	maxDepth int // maximum link depth to follow (0 = start page only)
	visited map[string]bool // normalized URLs already claimed; guarded by mu
	mu sync.Mutex // protects visited, totalPages, currentDepth
	baseURL *url.URL // parsed start URL; defines the crawl's host and resolves relative links
	client *http.Client // HTTP client with 10s timeout and redirect cap (see NewCrawler)
	eventChan chan models.Event // progress/error/complete event sink; may be nil
	uuid string // identifier of the site being crawled
	siteID int // database row ID of the site
	currentDepth int // depth of the most recently claimed page; guarded by mu
	totalPages int // count of unique pages visited; guarded by mu
}
|
|
|
|
func NewCrawler(db *database.DB) *Crawler {
|
|
return &Crawler{
|
|
db: db,
|
|
client: &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
if len(via) >= 10 {
|
|
return fmt.Errorf("too many redirects")
|
|
}
|
|
return nil
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
|
|
c.uuid = uuid
|
|
c.maxDepth = maxDepth
|
|
c.eventChan = eventChan
|
|
c.visited = make(map[string]bool)
|
|
c.totalPages = 0
|
|
|
|
// Parse base URL
|
|
parsedURL, err := url.Parse(startURL)
|
|
if err != nil {
|
|
c.sendEvent("error", models.ErrorData{
|
|
UUID: uuid,
|
|
Error: fmt.Sprintf("Invalid URL: %v", err),
|
|
})
|
|
return
|
|
}
|
|
c.baseURL = parsedURL
|
|
|
|
// Get site from database
|
|
site, err := c.db.GetSiteByUUID(uuid)
|
|
if err != nil {
|
|
c.sendEvent("error", models.ErrorData{
|
|
UUID: uuid,
|
|
Error: fmt.Sprintf("Failed to get site: %v", err),
|
|
})
|
|
return
|
|
}
|
|
c.siteID = site.ID
|
|
|
|
// Send started event
|
|
c.sendEvent("started", map[string]interface{}{
|
|
"uuid": uuid,
|
|
"url": startURL,
|
|
"max_depth": maxDepth,
|
|
})
|
|
|
|
// Start crawling from root
|
|
c.crawlURL(startURL, 0)
|
|
|
|
// Mark as completed
|
|
err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages)
|
|
if err != nil {
|
|
c.sendEvent("error", models.ErrorData{
|
|
UUID: uuid,
|
|
Error: fmt.Sprintf("Failed to update status: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
// Send completion event
|
|
c.sendEvent("complete", models.CompleteData{
|
|
UUID: uuid,
|
|
PagesFound: c.totalPages,
|
|
SiteID: c.siteID,
|
|
DownloadURL: fmt.Sprintf("/download/%s", uuid),
|
|
})
|
|
}
|
|
|
|
func (c *Crawler) crawlURL(urlStr string, depth int) {
|
|
// Check depth limit
|
|
if depth > c.maxDepth {
|
|
return
|
|
}
|
|
|
|
// Normalize URL
|
|
normalizedURL := c.normalizeURL(urlStr)
|
|
if normalizedURL == "" {
|
|
return
|
|
}
|
|
|
|
// Check if already visited
|
|
c.mu.Lock()
|
|
if c.visited[normalizedURL] {
|
|
c.mu.Unlock()
|
|
return
|
|
}
|
|
c.visited[normalizedURL] = true
|
|
c.totalPages++
|
|
currentTotal := c.totalPages
|
|
c.currentDepth = depth
|
|
c.mu.Unlock()
|
|
|
|
// Send progress event
|
|
c.sendEvent("progress", models.ProgressData{
|
|
UUID: c.uuid,
|
|
PagesFound: currentTotal,
|
|
Depth: depth,
|
|
CurrentURL: normalizedURL,
|
|
})
|
|
|
|
// Save page to database
|
|
priority := c.calculatePriority(depth)
|
|
page := &models.Page{
|
|
SiteID: c.siteID,
|
|
URL: normalizedURL,
|
|
Depth: depth,
|
|
LastModified: time.Now(),
|
|
Priority: priority,
|
|
ChangeFreq: "monthly",
|
|
}
|
|
|
|
if err := c.db.AddPage(page); err != nil {
|
|
// Log error but continue crawling
|
|
fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
|
|
}
|
|
|
|
// Fetch the page
|
|
resp, err := c.client.Get(normalizedURL)
|
|
if err != nil {
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Only process HTML pages
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if !strings.Contains(contentType, "text/html") {
|
|
return
|
|
}
|
|
|
|
// Parse HTML and extract links
|
|
links := c.extractLinks(resp)
|
|
|
|
// Crawl found links concurrently (with limited concurrency)
|
|
var wg sync.WaitGroup
|
|
semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests
|
|
|
|
for _, link := range links {
|
|
if depth+1 <= c.maxDepth {
|
|
wg.Add(1)
|
|
go func(l string) {
|
|
defer wg.Done()
|
|
semaphore <- struct{}{} // Acquire
|
|
c.crawlURL(l, depth+1)
|
|
<-semaphore // Release
|
|
}(link)
|
|
}
|
|
}
|
|
|
|
wg.Wait()
|
|
}
|
|
|
|
func (c *Crawler) extractLinks(resp *http.Response) []string {
|
|
var links []string
|
|
tokenizer := html.NewTokenizer(resp.Body)
|
|
|
|
for {
|
|
tokenType := tokenizer.Next()
|
|
if tokenType == html.ErrorToken {
|
|
break
|
|
}
|
|
|
|
if tokenType == html.StartTagToken {
|
|
token := tokenizer.Token()
|
|
if token.Data == "a" {
|
|
for _, attr := range token.Attr {
|
|
if attr.Key == "href" {
|
|
link := c.resolveURL(attr.Val)
|
|
if link != "" && c.isSameDomain(link) {
|
|
links = append(links, link)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return links
|
|
}
|
|
|
|
func (c *Crawler) resolveURL(href string) string {
|
|
parsedURL, err := url.Parse(href)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
// Resolve relative URLs
|
|
resolvedURL := c.baseURL.ResolveReference(parsedURL)
|
|
return resolvedURL.String()
|
|
}
|
|
|
|
func (c *Crawler) normalizeURL(urlStr string) string {
|
|
parsedURL, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
// Remove fragment
|
|
parsedURL.Fragment = ""
|
|
|
|
// Remove trailing slash for consistency
|
|
parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
|
|
if parsedURL.Path == "" {
|
|
parsedURL.Path = "/"
|
|
}
|
|
|
|
return parsedURL.String()
|
|
}
|
|
|
|
func (c *Crawler) isSameDomain(urlStr string) bool {
|
|
parsedURL, err := url.Parse(urlStr)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
// Check if same host
|
|
if parsedURL.Host != c.baseURL.Host {
|
|
return false
|
|
}
|
|
|
|
// Skip common non-HTML files
|
|
path := strings.ToLower(parsedURL.Path)
|
|
skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
|
|
for _, ext := range skipExtensions {
|
|
if strings.HasSuffix(path, ext) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func (c *Crawler) calculatePriority(depth int) float64 {
|
|
// Homepage gets highest priority
|
|
if depth == 0 {
|
|
return 1.0
|
|
}
|
|
// Decrease priority with depth
|
|
priority := 1.0 - (float64(depth) * 0.2)
|
|
if priority < 0.3 {
|
|
priority = 0.3
|
|
}
|
|
return priority
|
|
}
|
|
|
|
func (c *Crawler) sendEvent(eventType string, data interface{}) {
|
|
if c.eventChan != nil {
|
|
select {
|
|
case c.eventChan <- models.Event{Type: eventType, Data: data}:
|
|
default:
|
|
// Channel full or closed, skip event
|
|
}
|
|
}
|
|
}
|