update
This commit is contained in:
140
fetch_domain_zone.py
Normal file
140
fetch_domain_zone.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to fetch domain zone data from IANA root zone database
|
||||
"""
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import sys
|
||||
import re
|
||||
|
||||
def fetch_domain_zone_data(timeout=30):
    """Fetch the list of top-level domains from the IANA root zone database.

    The listing page is downloaded and its first ``<table>`` is parsed; each
    data row with at least three cells becomes one record.  If that yields
    nothing, links pointing below ``/domains/root/db/`` are followed one by
    one through ``fetch_domain_detail``.

    Args:
        timeout: Seconds to wait for the listing page before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        A list of dicts with the keys ``'domain'``, ``'type'`` and
        ``'tld_manager'``, or ``None`` if the request or the parsing failed
        (the error is printed).
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(
            'https://www.iana.org/domains/root/db',
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        domains = []

        # Primary strategy: the first table on the page.
        table = soup.find('table')
        if table:
            for row in table.find_all('tr')[1:]:  # Skip header row
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue

                domain = cells[0].get_text(strip=True).lower()
                domain_type = cells[1].get_text(strip=True)
                tld_manager = cells[2].get_text(strip=True)

                # Keep only ASCII hostname characters in the label and
                # collapse whitespace runs in the free-text columns.
                # NOTE(review): this also removes every non-ASCII character,
                # so a Unicode (IDN) label collapses to "." and is skipped
                # by the check below -- confirm that this is intended.
                domain = re.sub(r'[^a-z0-9.-]', '', domain)
                domain_type = re.sub(r'\s+', ' ', domain_type)
                tld_manager = re.sub(r'\s+', ' ', tld_manager)

                if domain and domain != '.':
                    domains.append({
                        'domain': domain,
                        'type': domain_type,
                        'tld_manager': tld_manager
                    })

        # Fallback strategy: follow per-domain links when the table gave
        # no usable rows.
        if not domains:
            domain_links = soup.find_all('a', href=re.compile(r'/domains/root/db/'))
            for link in domain_links:
                domain_text = link.get_text(strip=True).lower()
                # NOTE(review): link texts starting with "." are skipped
                # here; verify this filter against the real page markup.
                if not domain_text or len(domain_text) <= 1 or domain_text.startswith('.'):
                    continue

                # One extra HTTP request per link.
                detail_url = f"https://www.iana.org{link['href']}"
                detail_data = fetch_domain_detail(detail_url)
                if detail_data:
                    domains.append(detail_data)

        print(f"Fetched {len(domains)} domains from IANA root zone database")
        return domains

    except requests.RequestException as e:
        # Network-level failures, including the timeout added above.
        print(f"Error fetching IANA root zone data: {e}")
        return None
    except Exception as e:
        # Top-level boundary: any parsing problem is reported, not raised.
        print(f"Error parsing IANA data: {e}")
        return None
|
||||
|
||||
def _text_after_label(soup, label_pattern):
    """Return the text of the element following the first matching label.

    Returns ``None`` when no text node matching ``label_pattern`` has a
    parent with a following element.
    """
    # ``string=`` is the current name of the deprecated ``text=`` argument.
    for label in soup.find_all(string=label_pattern):
        parent = label.parent
        if not parent:
            continue
        # Prefer a sibling of the label's element, else the next element.
        follower = parent.find_next_sibling() or parent.find_next()
        if follower:
            return follower.get_text(strip=True)
    return None


def fetch_domain_detail(detail_url, timeout=30):
    """Fetch detailed information for a specific domain.

    Args:
        detail_url: Absolute URL of the domain's detail page.
        timeout: Seconds to wait for the page before giving up.  Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A dict with the keys ``'domain'``, ``'type'`` and ``'tld_manager'``,
        or ``None`` if the page could not be fetched or parsed (the error is
        printed).  ``'type'`` defaults to ``"generic"`` and ``'tld_manager'``
        to ``"Unknown"`` when no matching label is found on the page.
    """
    try:
        # Browser-like User-Agent sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(detail_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Derive the domain from the last path segment of the URL.  Detail
        # pages are addressed as ".../db/<tld>.html", so the page extension
        # must not end up in the domain name.
        domain = detail_url.rstrip('/').split('/')[-1].lower()
        if domain.endswith('.html'):
            domain = domain[:-len('.html')]

        # Defaults used when the page has no recognisable label.
        domain_type = "generic"
        tld_manager = "Unknown"

        found_type = _text_after_label(
            soup, re.compile(r'Type|Type of domain', re.IGNORECASE)
        )
        if found_type is not None:
            domain_type = found_type

        found_manager = _text_after_label(
            soup, re.compile(r'Manager|Sponsor|Registry', re.IGNORECASE)
        )
        if found_manager is not None:
            tld_manager = found_manager

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager
        }

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller skip it.
        print(f"Error fetching detail for {detail_url}: {e}")
        return None
|
||||
|
||||
def main():
    """Fetch the domain records and print a short report.

    Prints up to the first ten records and the total count.

    Returns:
        The list returned by ``fetch_domain_zone_data``, or ``None`` when
        that call returned ``None`` or an empty list.
    """
    print("Fetching IANA root zone database data...")

    records = fetch_domain_zone_data()

    # Guard clause: nothing usable came back.
    if not records:
        print("Failed to fetch domain data")
        return None

    print(f"\nSample data:")
    for position, record in enumerate(records[:10], start=1):
        print(f"{position}. {record['domain']} - {record['type']} - {record['tld_manager']}")

    print(f"\nTotal domains fetched: {len(records)}")
    return records
|
||||
|
||||
if __name__ == "__main__":
    # main() returns None when nothing could be fetched; turn that into a
    # non-zero exit status so shells and schedulers can detect the failure.
    sys.exit(0 if main() else 1)
|
||||
Reference in New Issue
Block a user