From 26e70981ee986a476790bf0d960804e2bf9df33a Mon Sep 17 00:00:00 2001 From: "Kar@k5" Date: Wed, 11 Mar 2026 21:18:24 +0530 Subject: [PATCH] init --- .gitignore | 4 + README.md | 147 +++++++++++++++++ add_removed_column.sql | 11 ++ create_separate_tables.sql | 17 ++ populate_separate_tables.py | 206 ++++++++++++++++++++++++ requirements.txt | 2 + update.py | 308 ++++++++++++++++++++++++++++++++++++ 7 files changed, 695 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 add_removed_column.sql create mode 100644 create_separate_tables.sql create mode 100644 populate_separate_tables.py create mode 100644 requirements.txt create mode 100644 update.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c35de6f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +venv/ +__pycache__/ +*.pyc +*.log diff --git a/README.md b/README.md new file mode 100644 index 0000000..68348b8 --- /dev/null +++ b/README.md @@ -0,0 +1,147 @@ +# Internet Domain Database + +This project maintains two separate tables for internet domain data: +- `domain_root`: Top-Level Domains (TLDs) from IANA +- `domain_suffix`: Public Suffix List from Mozilla + +## Database Connection + +- **Host**: l2 +- **Port**: 3306 +- **User**: root +- **Database**: sp_spider + +## Table Structure + +### domain_root +Contains IANA TLD data with unique root domains and soft delete functionality. +```sql +CREATE TABLE domain_root ( + id INT AUTO_INCREMENT PRIMARY KEY, + root VARCHAR(63) NOT NULL UNIQUE, + removed BOOLEAN DEFAULT FALSE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_root (root), + INDEX idx_removed (removed) +); +``` + +### domain_suffix +Contains Public Suffix List data with unique suffixes and soft delete functionality. 
+```sql +CREATE TABLE domain_suffix ( + id INT AUTO_INCREMENT PRIMARY KEY, + suffix VARCHAR(255) NOT NULL UNIQUE, + removed BOOLEAN DEFAULT FALSE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_suffix (suffix), + INDEX idx_removed (removed) +); +``` + +## Setup + +1. Install dependencies: +```bash +pip install -r requirements.txt +``` + +2. Create the tables: +```bash +mariadb -h l2 -u root -p0000 --ssl=FALSE sp_spider < create_separate_tables.sql +``` + +3. Add the removed column (for existing installations): +```bash +mariadb -h l2 -u root -p0000 --ssl=FALSE sp_spider < add_removed_column.sql +``` + +4. Populate the tables: +```bash +python populate_separate_tables.py 0000 +``` + +5. Update tables with soft delete functionality: +```bash +python update.py 0000 +``` + +## Data Sources + +- **TLD Data**: https://data.iana.org/TLD/tlds-alpha-by-domain.txt + - Contains official Top-Level Domains + - Currently: 1,436 TLDs + +- **Public Suffix List**: https://publicsuffix.org/list/public_suffix_list.dat + - Contains public domain suffixes including TLDs, ccTLDs, and private domain suffixes + - Currently: 10,067 suffixes + +## Usage + +Query active TLDs: +```sql +SELECT * FROM domain_root WHERE removed = FALSE AND root = 'com'; +``` + +Query removed TLDs: +```sql +SELECT * FROM domain_root WHERE removed = TRUE; +``` + +Query active suffixes: +```sql +SELECT * FROM domain_suffix WHERE removed = FALSE AND suffix LIKE '%.com'; +``` + +Query removed suffixes: +```sql +SELECT * FROM domain_suffix WHERE removed = TRUE; +``` + +Get statistics: +```sql +SELECT + 'domain_root' as table_name, + COUNT(*) as total, + SUM(CASE WHEN removed = FALSE THEN 1 ELSE 0 END) as active, + SUM(CASE WHEN removed = TRUE THEN 1 ELSE 0 END) as removed +FROM domain_root +UNION ALL +SELECT + 'domain_suffix' as table_name, + COUNT(*) as total, + SUM(CASE WHEN removed = FALSE THEN 1 ELSE 0 END) as active, + 
SUM(CASE WHEN removed = TRUE THEN 1 ELSE 0 END) as removed +FROM domain_suffix; +``` + +## Project Files + +- `create_separate_tables.sql` - Table creation script +- `add_removed_column.sql` - Script to add removed column for soft delete functionality +- `populate_separate_tables.py` - Initial data population script +- `update.py` - Update script with soft delete functionality +- `requirements.txt` - Python dependencies +- `README.md` - This documentation + +## Soft Delete Functionality + +The `update.py` script provides soft delete functionality that: + +1. **Marks entries as removed**: Sets `removed = TRUE` for entries no longer found in source data +2. **Adds new entries**: Inserts new entries from source with `removed = FALSE` +3. **Restores entries**: Sets `removed = FALSE` for previously removed entries that reappear in source +4. **Provides statistics**: Shows counts of active, removed, new, and restored entries + +### Update Process + +The update script: +- Fetches latest data from IANA TLD and Public Suffix List +- Compares with current database entries +- Performs batch updates for efficiency +- Handles duplicate entries gracefully with `INSERT IGNORE` +- Updates `updated_at` timestamp for all changes + +Run the update script periodically to keep the database synchronized with source data. 
diff --git a/add_removed_column.sql b/add_removed_column.sql
new file mode 100644
index 0000000..6bc47c7
--- /dev/null
+++ b/add_removed_column.sql
@@ -0,0 +1,14 @@
+-- Safe to re-run: IF NOT EXISTS (MariaDB syntax) skips a column or index that
+-- is already present, e.g. on tables made by create_separate_tables.sql.
+
+-- Add removed column to domain_root table
+ALTER TABLE domain_root
+ADD COLUMN IF NOT EXISTS removed BOOLEAN DEFAULT FALSE;
+
+-- Add removed column to domain_suffix table
+ALTER TABLE domain_suffix
+ADD COLUMN IF NOT EXISTS removed BOOLEAN DEFAULT FALSE;
+
+-- Add index for removed column for better query performance
+ALTER TABLE domain_root ADD INDEX IF NOT EXISTS idx_removed (removed);
+ALTER TABLE domain_suffix ADD INDEX IF NOT EXISTS idx_removed (removed);
diff --git a/create_separate_tables.sql b/create_separate_tables.sql
new file mode 100644
index 0000000..c4f3d02
--- /dev/null
+++ b/create_separate_tables.sql
@@ -0,0 +1,21 @@
+-- Create domain_root table for IANA TLD data
+CREATE TABLE IF NOT EXISTS domain_root (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    root VARCHAR(63) NOT NULL UNIQUE,
+    removed BOOLEAN DEFAULT FALSE,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_root (root),
+    INDEX idx_removed (removed)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Create domain_suffix table for Public Suffix List data
+CREATE TABLE IF NOT EXISTS domain_suffix (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    suffix VARCHAR(255) NOT NULL UNIQUE,
+    removed BOOLEAN DEFAULT FALSE,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    INDEX idx_suffix (suffix),
+    INDEX idx_removed (removed)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
diff --git a/populate_separate_tables.py b/populate_separate_tables.py
new file mode 100644
index 0000000..3688a8d
--- /dev/null
+++ b/populate_separate_tables.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Script to populate domain_root and domain_suffix tables separately
+"""
+
+import getpass
+import sys
+
+import mysql.connector
+import requests
+
+# Database connection configuration
+DB_CONFIG = {
+    'host': 'l2',
+    'port': 3306,
+    'user': 'root',
+    'password': None,  # Will be set from command line or input
+    'database': 'sp_spider',
+    'charset': 'utf8mb4',
+    'ssl_disabled': True,
+    'auth_plugin': 'mysql_native_password'
+}
+
+# URLs for data sources
+IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
+PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
+
+# Seconds to wait on a source server; without a timeout requests can hang forever
+HTTP_TIMEOUT = 30
+# Rows sent to the database per batch
+BATCH_SIZE = 100
+
+
+def _fetch_entries(url, comment_prefix, label):
+    """Download a line-based list and return its unique lower-cased entries.
+
+    Blank lines and lines starting with comment_prefix are skipped, and
+    duplicates are dropped while keeping first-seen order.  Returns None
+    when the download fails.
+    """
+    try:
+        response = requests.get(url, timeout=HTTP_TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"Error fetching {label} data: {e}")
+        return None
+
+    # The Public Suffix List is published as UTF-8 and the IANA list is ASCII;
+    # decode explicitly so a missing charset header cannot garble IDN entries.
+    response.encoding = 'utf-8'
+
+    entries = {}
+    for line in response.text.splitlines():
+        line = line.strip()
+        if line and not line.startswith(comment_prefix):
+            entries[line.lower()] = None
+    return list(entries)
+
+
+def fetch_tld_data():
+    """Fetch TLD data from IANA"""
+    return _fetch_entries(IANA_TLD_URL, '#', 'TLD')
+
+
+def fetch_psl_data():
+    """Fetch Public Suffix List data"""
+    return _fetch_entries(PSL_URL, '//', 'PSL')
+
+
+def _populate_table(table, column, values, noun):
+    """Insert values into table.column in batches; return True on success.
+
+    table and column are fixed identifiers chosen by this module, never user
+    input, so formatting them into the SQL text is safe.  The values are
+    always bound as parameters.  Existing rows are left alone (INSERT IGNORE).
+    """
+    conn = None
+    try:
+        conn = mysql.connector.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+
+        insert_query = f"INSERT IGNORE INTO {table} ({column}) VALUES (%s)"
+        inserted_count = 0
+
+        for start in range(0, len(values), BATCH_SIZE):
+            data = [(value,) for value in values[start:start + BATCH_SIZE]]
+            cursor.executemany(insert_query, data)
+            inserted_count += cursor.rowcount
+            conn.commit()
+            print(f"{table} batch {start // BATCH_SIZE + 1}: {cursor.rowcount} {noun}")
+
+        print(f"Successfully inserted {inserted_count} {noun} into {table} table")
+
+        # Get total count
+        cursor.execute(f"SELECT COUNT(*) FROM {table}")
+        total_count = cursor.fetchone()[0]
+        print(f"Total {noun} in {table} table: {total_count}")
+
+        return True
+
+    except mysql.connector.Error as e:
+        print(f"Database error in {table}: {e}")
+        return False
+    finally:
+        # conn is still None when connect() itself raised
+        if conn is not None and conn.is_connected():
+            conn.close()
+
+
+def populate_domain_root(tlds):
+    """Populate domain_root table with TLD data"""
+    return _populate_table('domain_root', 'root', tlds, 'TLDs')
+
+
+def populate_domain_suffix(suffixes):
+    """Populate domain_suffix table with PSL data"""
+    return _populate_table('domain_suffix', 'suffix', suffixes, 'suffixes')
+
+
+def show_sample_data():
+    """Show sample data from both tables"""
+    conn = None
+    try:
+        conn = mysql.connector.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+
+        for table, column in (('domain_root', 'root'), ('domain_suffix', 'suffix')):
+            print(f"\n=== Sample data from {table} table ===")
+            cursor.execute(f"SELECT id, {column}, created_at FROM {table} ORDER BY id LIMIT 10")
+            for row in cursor.fetchall():
+                print(f"{row[0]} {row[1]} {row[2]}")
+
+    except mysql.connector.Error as e:
+        print(f"Database error: {e}")
+    finally:
+        # Close the connection even when a query failed
+        if conn is not None and conn.is_connected():
+            conn.close()
+
+
+def main():
+    """Fetch both source lists and load them into the database."""
+    # Get password from command line argument or prompt.  The prompt is
+    # safer: an argv password is visible in the process list and shell history.
+    if len(sys.argv) > 1:
+        password = sys.argv[1]
+    else:
+        password = getpass.getpass("Enter MariaDB password for user 'root': ")
+
+    DB_CONFIG['password'] = password
+
+    print("Starting separate tables population process...")
+
+    # Fetch TLD data
+    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
+    tlds = fetch_tld_data()
+    if not tlds:
+        print("Failed to fetch TLD data")
+        sys.exit(1)
+    print(f"Fetched {len(tlds)} TLDs")
+
+    # Fetch PSL data
+    print(f"\nFetching PSL data from: {PSL_URL}")
+    suffixes = fetch_psl_data()
+    if not suffixes:
+        print("Failed to fetch PSL data")
+        sys.exit(1)
+    print(f"Fetched {len(suffixes)} suffixes")
+
+    # Populate domain_root table
+    print("\nPopulating domain_root table...")
+    if not populate_domain_root(tlds):
+        print("Failed to populate domain_root table")
+        sys.exit(1)
+
+    # Populate domain_suffix table
+    print("\nPopulating domain_suffix table...")
+    if not populate_domain_suffix(suffixes):
+        print("Failed to populate domain_suffix table")
+        sys.exit(1)
+
+    # Show sample data
+    show_sample_data()
+
+    print("\n=== Tables population completed successfully ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..304218b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+mysql-connector-python==8.2.0
+requests==2.31.0
diff --git a/update.py b/update.py
new file mode 100644
index 0000000..2b20fb6
--- /dev/null
+++ b/update.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Script to update domain_root and domain_suffix tables with soft delete functionality
+"""
+
+import getpass
+import sys
+
+import mysql.connector
+
+# Connection settings and source fetching are shared with the populate script
+# so the two scripts cannot drift apart.
+from populate_separate_tables import (
+    BATCH_SIZE,
+    DB_CONFIG,
+    IANA_TLD_URL,
+    PSL_URL,
+    fetch_psl_data,
+    fetch_tld_data,
+)
+
+
+def _chunks(values, size):
+    """Yield consecutive slices of at most size items from the list values."""
+    for start in range(0, len(values), size):
+        yield values[start:start + size]
+
+
+def _mark_rows(cursor, table, column, values, removed):
+    """Set the removed flag and refresh updated_at for the given values.
+
+    Rows are updated in batches so the IN (...) list stays a bounded size.
+    """
+    for chunk in _chunks(sorted(values), BATCH_SIZE):
+        placeholders = ','.join(['%s'] * len(chunk))
+        cursor.execute(
+            f"UPDATE {table} SET removed = %s, updated_at = CURRENT_TIMESTAMP "
+            f"WHERE {column} IN ({placeholders})",
+            [removed, *chunk]
+        )
+
+
+def _sync_table(table, column, values, noun):
+    """Synchronise table.column with the source list values.
+
+    Active rows missing from the source are soft-deleted, soft-deleted rows
+    that reappear are restored, unknown values are inserted, and active rows
+    still in the source get a fresh updated_at.  Everything is committed once
+    at the end so an error part-way does not leave a half-applied update
+    (relies on the connector's default autocommit-off mode and InnoDB tables).
+    table and column are fixed identifiers, never user input.  Returns True on success.
+    """
+    if not values:
+        # An empty source almost certainly means a bad download, not mass deletion
+        print(f"Refusing to update {table}: source list is empty")
+        return False
+
+    conn = None
+    try:
+        conn = mysql.connector.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+
+        # Load every row, soft-deleted ones included: restoring needs the
+        # removed rows, and they must not be mistaken for new entries.
+        cursor.execute(f"SELECT {column}, removed FROM {table}")
+        active, soft_deleted = set(), set()
+        for value, removed in cursor.fetchall():
+            (soft_deleted if removed else active).add(value)
+
+        source = set(values)
+        removed_values = active - source
+        to_restore = soft_deleted & source
+        new_values = source - active - soft_deleted
+        verified_active = active & source
+
+        # Mark entries as removed if not in source
+        if removed_values:
+            print(f"Marking {len(removed_values)} {noun} as removed")
+            _mark_rows(cursor, table, column, removed_values, True)
+
+        # Add new entries
+        added_count = 0
+        if new_values:
+            print(f"Adding {len(new_values)} new {noun}")
+            insert_query = f"INSERT IGNORE INTO {table} ({column}, removed) VALUES (%s, FALSE)"
+            for number, chunk in enumerate(_chunks(sorted(new_values), BATCH_SIZE), 1):
+                cursor.executemany(insert_query, [(value,) for value in chunk])
+                added_count += cursor.rowcount
+                print(f"{table} batch {number}: {cursor.rowcount} new {noun}")
+
+        # Restore entries that were previously removed but now exist in source
+        if to_restore:
+            print(f"Restoring {len(to_restore)} previously removed {noun}")
+            _mark_rows(cursor, table, column, to_restore, False)
+
+        # Update updated_at timestamp for all active entries that still exist in source
+        if verified_active:
+            print(f"Updating timestamps for {len(verified_active)} verified active {noun}")
+            _mark_rows(cursor, table, column, verified_active, False)
+
+        conn.commit()
+
+        # Show statistics
+        cursor.execute(f"SELECT COUNT(*) FROM {table} WHERE removed = FALSE")
+        active_count = cursor.fetchone()[0]
+        cursor.execute(f"SELECT COUNT(*) FROM {table} WHERE removed = TRUE")
+        removed_count = cursor.fetchone()[0]
+
+        print(f"{table} update completed:")
+        print(f"  Active entries: {active_count}")
+        print(f"  Removed entries: {removed_count}")
+        print(f"  New entries added: {added_count}")
+        print(f"  Entries marked as removed: {len(removed_values)}")
+        print(f"  Entries restored: {len(to_restore)}")
+
+        return True
+
+    except mysql.connector.Error as e:
+        print(f"Database error in {table}: {e}")
+        return False
+    finally:
+        # conn is still None when connect() itself raised
+        if conn is not None and conn.is_connected():
+            conn.close()
+
+
+def update_domain_root(tlds):
+    """Update domain_root table with soft delete and new entries"""
+    return _sync_table('domain_root', 'root', tlds, 'TLDs')
+
+
+def update_domain_suffix(suffixes):
+    """Update domain_suffix table with soft delete and new entries"""
+    return _sync_table('domain_suffix', 'suffix', suffixes, 'suffixes')
+
+
+def show_sample_data():
+    """Show sample data from both tables"""
+    conn = None
+    try:
+        conn = mysql.connector.connect(**DB_CONFIG)
+        cursor = conn.cursor()
+
+        for table, column in (('domain_root', 'root'), ('domain_suffix', 'suffix')):
+            print(f"\n=== Sample data from {table} table ===")
+            cursor.execute(
+                f"SELECT id, {column}, removed, created_at FROM {table} ORDER BY id LIMIT 10"
+            )
+            for row in cursor.fetchall():
+                status = "REMOVED" if row[2] else "ACTIVE"
+                print(f"{row[0]} {row[1]} [{status}] {row[3]}")
+
+    except mysql.connector.Error as e:
+        print(f"Database error: {e}")
+    finally:
+        # Close the connection even when a query failed
+        if conn is not None and conn.is_connected():
+            conn.close()
+
+
+def main():
+    """Fetch both source lists and synchronise the tables with them."""
+    # Get password from command line argument or prompt.  The prompt is
+    # safer: an argv password is visible in the process list and shell history.
+    if len(sys.argv) > 1:
+        password = sys.argv[1]
+    else:
+        password = getpass.getpass("Enter MariaDB password for user 'root': ")
+
+    DB_CONFIG['password'] = password
+
+    print("Starting domain tables update process with soft delete functionality...")
+
+    # Fetch TLD data
+    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
+    tlds = fetch_tld_data()
+    if not tlds:
+        print("Failed to fetch TLD data")
+        sys.exit(1)
+    print(f"Fetched {len(tlds)} TLDs")
+
+    # Fetch PSL data
+    print(f"\nFetching PSL data from: {PSL_URL}")
+    suffixes = fetch_psl_data()
+    if not suffixes:
+        print("Failed to fetch PSL data")
+        sys.exit(1)
+    print(f"Fetched {len(suffixes)} suffixes")
+
+    # Update domain_root table
+    print("\nUpdating domain_root table...")
+    if not update_domain_root(tlds):
+        print("Failed to update domain_root table")
+        sys.exit(1)
+
+    # Update domain_suffix table
+    print("\nUpdating domain_suffix table...")
+    if not update_domain_suffix(suffixes):
+        print("Failed to update domain_suffix table")
+        sys.exit(1)
+
+    # Show sample data
+    show_sample_data()
+
+    print("\n=== Tables update completed successfully ===")
+
+
+if __name__ == "__main__":
+    main()