update

2026-03-11 23:08:57 +05:30
parent 26e70981ee
commit 95aab950da
6 changed files with 892 additions and 1 deletions
--- a/fetch_domain_zone.py
+++ b/fetch_domain_zone.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+"""
+Script to fetch domain zone data from IANA root zone database
+"""
+
+import requests
+from bs4 import BeautifulSoup
+import sys
+import re
+
+def fetch_domain_zone_data():
+    """Fetch the TLD list from the IANA root zone database page.
+
+    Parses the first <table> on the page (domain, type, manager columns);
+    if that yields nothing, falls back to following per-domain links.
+
+    Returns:
+        list of dicts with 'domain', 'type' and 'tld_manager' keys, or
+        None if the request or the parsing raised an error.
+    """
+    # Browser-like User-Agent sent with the request.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    try:
+        # A timeout keeps an unresponsive server from hanging the script.
+        response = requests.get('https://www.iana.org/domains/root/db', headers=headers, timeout=30)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        domains = []
+        table = soup.find('table')
+        rows = table.find_all('tr') if table else []
+        for row in rows[1:]:  # Skip header row
+            cells = row.find_all('td')
+            if len(cells) < 3:
+                continue
+            domain = re.sub(r'[^a-z0-9.-]', '', cells[0].get_text(strip=True).lower())
+            if domain in ('', '.'):
+                # Non-ASCII (IDN) labels are wiped out by the ASCII filter above;
+                # recover an ASCII label from the cell's link target instead of
+                # silently dropping the row.
+                link = cells[0].find('a', href=True)
+                label = link['href'].rstrip('/').split('/')[-1].lower() if link else ''
+                label = re.sub(r'[^a-z0-9.-]', '', re.sub(r'\.html?$', '', label))
+                domain = f'.{label}' if label else ''
+            if domain:
+                domains.append({
+                    'domain': domain,
+                    'type': re.sub(r'\s+', ' ', cells[1].get_text(strip=True)),
+                    'tld_manager': re.sub(r'\s+', ' ', cells[2].get_text(strip=True)),
+                })
+
+        # If table approach doesn't work, try alternative parsing
+        if not domains:
+            for link in soup.find_all('a', href=re.compile(r'/domains/root/db/')):
+                domain_text = link.get_text(strip=True).lower()
+                # NOTE(review): this skips link texts starting with '.'; if the page
+                # labels TLD links as ".com" the fallback collects nothing -- confirm.
+                if domain_text and len(domain_text) > 1 and not domain_text.startswith('.'):
+                    detail_data = fetch_domain_detail(f"https://www.iana.org{link['href']}")
+                    if detail_data:
+                        domains.append(detail_data)
+
+        print(f"Fetched {len(domains)} domains from IANA root zone database")
+        return domains
+    except requests.RequestException as e:
+        print(f"Error fetching IANA root zone data: {e}")
+        return None
+    except Exception as e:
+        print(f"Error parsing IANA data: {e}")
+        return None
+
+def fetch_domain_detail(detail_url):
+    """Fetch type and manager information from one domain's detail page.
+
+    Args:
+        detail_url: Absolute URL of the detail page; its last path segment
+            (minus any ".html" suffix) is used as the domain label.
+
+    Returns:
+        dict with 'domain', 'type' and 'tld_manager' keys ('type' defaults
+        to "generic" and 'tld_manager' to "Unknown" when not found), or
+        None if any error occurred (the error is printed).
+    """
+    # Browser-like User-Agent sent with the request.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    try:
+        # Bounded wait: the caller may invoke this once per link.
+        response = requests.get(detail_url, headers=headers, timeout=30)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Strip the page suffix so ".../com.html" yields ".com" rather than
+        # "com.html"; the dot matches the names the table parser appears to keep.
+        label = re.sub(r'\.html?$', '', detail_url.rstrip('/').split('/')[-1].lower())
+        domain = f'.{label}'
+
+        def text_after(pattern, default):
+            """Return the text of the element following the first label match."""
+            # `string=` replaces the deprecated `text=` keyword of find_all().
+            for element in soup.find_all(string=re.compile(pattern, re.IGNORECASE)):
+                parent = element.parent
+                if parent:
+                    follower = parent.find_next_sibling() or parent.find_next()
+                    if follower:
+                        return follower.get_text(strip=True)
+            return default
+
+        return {
+            'domain': domain,
+            'type': text_after(r'Type|Type of domain', "generic"),
+            'tld_manager': text_after(r'Manager|Sponsor|Registry', "Unknown"),
+        }
+    except Exception as e:
+        # Best-effort: one bad detail page is reported, not raised.
+        print(f"Error fetching detail for {detail_url}: {e}")
+        return None
+
+def main():
+    """Fetch the domain list, print a short preview, and return it.
+
+    Returns the fetched list, or None when the fetch failed or came back empty.
+    """
+    print("Fetching IANA root zone database data...")
+    domains = fetch_domain_zone_data()
+    if not domains:
+        print("Failed to fetch domain data")
+        return None
+    print("\nSample data:")
+    for position, entry in enumerate(domains[:10], start=1):
+        print(f"{position}. {entry['domain']} - {entry['type']} - {entry['tld_manager']}")
+    print(f"\nTotal domains fetched: {len(domains)}")
+    return domains
+
+if __name__ == "__main__":  # Run only when executed as a script, not on import.
+    main()