init

crawler.go (new file, 287 lines)

@@ -0,0 +1,287 @@
package crawler

import (
	"fmt"
	"net/http"
	"net/url"
	"sitemap-api/database"
	"sitemap-api/models"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
)

// Crawler walks a single site starting from a root URL, records every page it
// visits in the database, and streams progress events over the provided channel.
type Crawler struct {
	db           *database.DB
	maxDepth     int
	visited      map[string]bool
	mu           sync.Mutex
	baseURL      *url.URL
	client       *http.Client
	eventChan    chan models.Event
	uuid         string
	siteID       int
	currentDepth int
	totalPages   int
}

// NewCrawler returns a Crawler backed by the given database handle and an
// HTTP client with a 10-second timeout and a 10-redirect limit.
func NewCrawler(db *database.DB) *Crawler {
	return &Crawler{
		db: db,
		client: &http.Client{
			Timeout: 10 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
	}
}

// Crawl resets per-run state and crawls startURL up to maxDepth levels deep,
// reporting progress, errors, and completion on eventChan.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0

	// Parse base URL
	parsedURL, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = parsedURL

	// Get site from database
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID

	// Send started event
	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})

	// Start crawling from the root
	c.crawlURL(startURL, 0)

	// Mark as completed
	err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}

	// Send completion event
	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}

// crawlURL visits a single page: it records it, fetches it, and recurses into
// same-domain links until maxDepth is reached.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	// Check depth limit
	if depth > c.maxDepth {
		return
	}

	// Normalize URL
	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}

	// Check if already visited; mark and count the page under the lock.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages
	c.currentDepth = depth
	c.mu.Unlock()

	// Send progress event
	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})

	// Save page to database
	priority := c.calculatePriority(depth)
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     priority,
		ChangeFreq:   "monthly",
	}

	if err := c.db.AddPage(page); err != nil {
		// Log the error but continue crawling
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}

	// Fetch the page
	resp, err := c.client.Get(normalizedURL)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// Only process HTML pages
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return
	}

	// Parse HTML and extract links
	links := c.extractLinks(resp)

	// Crawl found links concurrently. The semaphore caps concurrency at 5
	// requests per page (it is local to this call, not shared crawl-wide).
	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 5)

	for _, link := range links {
		if depth+1 <= c.maxDepth {
			wg.Add(1)
			go func(l string) {
				defer wg.Done()
				semaphore <- struct{}{} // Acquire
				c.crawlURL(l, depth+1)
				<-semaphore // Release
			}(link)
		}
	}

	wg.Wait()
}

// extractLinks tokenizes the HTML response body and collects the href targets
// of <a> tags that resolve to the same host as the crawl's base URL.
func (c *Crawler) extractLinks(resp *http.Response) []string {
	var links []string
	tokenizer := html.NewTokenizer(resp.Body)

	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			break
		}

		if tokenType == html.StartTagToken {
			token := tokenizer.Token()
			if token.Data == "a" {
				for _, attr := range token.Attr {
					if attr.Key == "href" {
						link := c.resolveURL(attr.Val)
						if link != "" && c.isSameDomain(link) {
							links = append(links, link)
						}
					}
				}
			}
		}
	}

	return links
}

// resolveURL resolves an href against the crawl's base URL, so relative links
// are interpreted relative to the start URL, not the page they were found on.
func (c *Crawler) resolveURL(href string) string {
	parsedURL, err := url.Parse(href)
	if err != nil {
		return ""
	}

	// Resolve relative URLs against the base URL
	resolvedURL := c.baseURL.ResolveReference(parsedURL)
	return resolvedURL.String()
}

// normalizeURL strips the fragment and any trailing slash so that equivalent
// URLs dedupe to the same visited-map key.
func (c *Crawler) normalizeURL(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}

	// Remove fragment
	parsedURL.Fragment = ""

	// Remove trailing slash for consistency, keeping "/" for the root
	parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
	if parsedURL.Path == "" {
		parsedURL.Path = "/"
	}

	return parsedURL.String()
}

// isSameDomain reports whether urlStr is on the same host as the base URL and
// does not point at a common non-HTML asset.
func (c *Crawler) isSameDomain(urlStr string) bool {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	// Check if same host
	if parsedURL.Host != c.baseURL.Host {
		return false
	}

	// Skip common non-HTML files
	path := strings.ToLower(parsedURL.Path)
	skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
	for _, ext := range skipExtensions {
		if strings.HasSuffix(path, ext) {
			return false
		}
	}

	return true
}

// calculatePriority maps crawl depth to a sitemap priority: 1.0 for the
// homepage, then 0.8, 0.6, 0.4 for depths 1-3, with a floor of 0.3.
func (c *Crawler) calculatePriority(depth int) float64 {
	// Homepage gets the highest priority
	if depth == 0 {
		return 1.0
	}
	// Decrease priority with depth
	priority := 1.0 - (float64(depth) * 0.2)
	if priority < 0.3 {
		priority = 0.3
	}
	return priority
}

// sendEvent performs a non-blocking send on the event channel; if the channel
// buffer is full, the event is dropped rather than stalling the crawl.
func (c *Crawler) sendEvent(eventType string, data interface{}) {
	if c.eventChan != nil {
		select {
		case c.eventChan <- models.Event{Type: eventType, Data: data}:
		default:
			// Channel full; drop the event
		}
	}
}
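
The sitemap-api/models and sitemap-api/database packages referenced above are not part of this commit. As a reading aid, here is a minimal sketch of the models types crawler.go appears to assume, inferred purely from how the fields are used in this file; the exact field types and any struct tags are assumptions. The database.DB handle is likewise expected to expose GetSiteByUUID, AddPage, and UpdateSiteStatus.

// Sketch only: inferred from usage in crawler.go, not the actual package.
package models

import "time"

// Event is the envelope sent on the crawl's event channel.
type Event struct {
	Type string      // "started", "progress", "error", "complete"
	Data interface{} // one of the payloads below, or an ad-hoc map
}

type ErrorData struct {
	UUID  string
	Error string
}

type ProgressData struct {
	UUID       string
	PagesFound int
	Depth      int
	CurrentURL string
}

type CompleteData struct {
	UUID        string
	PagesFound  int
	SiteID      int
	DownloadURL string
}

// Page is the record persisted for each visited URL.
type Page struct {
	SiteID       int
	URL          string
	Depth        int
	LastModified time.Time
	Priority     float64
	ChangeFreq   string
}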
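
For context, a hypothetical caller could wire the crawler up as below; the event consumer and the database.Open constructor are illustrative assumptions rather than code from this commit, and the import paths assume the module is named sitemap-api.

// Hypothetical usage sketch; not part of this commit.
package main

import (
	"fmt"

	"sitemap-api/crawler"
	"sitemap-api/database"
	"sitemap-api/models"
)

func main() {
	// database.Open is an assumed constructor; the real database package is not shown here.
	db, err := database.Open("sitemap.db")
	if err != nil {
		panic(err)
	}

	events := make(chan models.Event, 64)
	c := crawler.NewCrawler(db)

	// Crawl is synchronous, so run it in the background and close the channel
	// when it returns; no events are sent after Crawl returns.
	go func() {
		c.Crawl("some-uuid", "https://example.com", 3, events)
		close(events)
	}()

	// Forward events to the client (SSE, websocket, logs, ...).
	for ev := range events {
		fmt.Printf("%s: %+v\n", ev.Type, ev.Data)
	}
}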