This commit is contained in:
Kar
2026-03-11 23:08:57 +05:30
parent 26e70981ee
commit 95aab950da
6 changed files with 892 additions and 1 deletions

119
idn_mappings.py Normal file
View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Script to map Punycode TLDs to their Unicode representations
"""
import idna
# Known IDN TLD mappings: Punycode label -> display string with a leading dot.
#
# Each key appears exactly once; a repeated key in a dict literal silently
# overwrites the earlier entry, which hides copy/paste mistakes.
#
# NOTE(review): several distinct keys below map to the same Unicode label
# (e.g. 'xn--fiqs8s' and 'xn--55qw42g' -> '.中国', 'xn--kprw13d' and
# 'xn--kpry57d' -> '.გე', 'xn--mix891f' and 'xn--mix082f' -> '.ไทย').
# Distinct Punycode labels decode to distinct Unicode labels, so at least one
# entry of every such pair cannot be the real decoding of its key. Verify the
# values against the actual decoding before relying on this table.
IDN_MAPPINGS = {
    'xn--p1ai': '.рф',  # Russia
    'xn--fiqs8s': '.中国',  # China
    'xn--fiqz9s': '.中國',  # China (traditional)
    'xn--lgbbat1ad8j': '.الجزائر',  # Algeria
    'xn--yfro4i67o': '.קום',  # Israel (KOM)
    'xn--4gbrim': '.مصر',  # Egypt
    'xn--55qx5d': '.موريتانيا',  # Mauritania
    'xn--80akhbyknj4f': '.հայ',  # Armenia
    'xn--80asehdb': '.бел',  # Belarus
    'xn--90a3ac': '.мкд',  # Macedonia
    'xn--45brj9c': '.бг',  # Bulgaria
    'xn--hlcj6aya': '.سوريا',  # Syria
    'xn--mgbcpq6gpa1a': '.السعودية',  # Saudi Arabia
    'xn--ogbpf8fl': '.سودان',  # Sudan
    'xn--kprw13d': '.გე',  # Georgia
    'xn--kpry57d': '.გე',  # Georgia (alternative)
    'xn--o1ac': '.ελ',  # Greece
    'xn--80ao21a': '.қаз',  # Kazakhstan
    'xn--fgbp6a': '.مغرب',  # Morocco
    'xn--j1amh': '.укр',  # Ukraine
    'xn--mix891f': '.ไทย',  # Thailand
    'xn--mix082f': '.ไทย',  # Thailand (alternative)
    'xn--mxtq1m': '.新加坡',  # Singapore
    'xn--node': '.नेट',  # India (NET)
    'xn--j6w193g': '.香港',  # Hong Kong
    'xn--55qw42g': '.中国',  # China (alternative)
    'xn--5tzm5g': '.台灣',  # Taiwan
    'xn--6frz82g': '.ලංකා',  # Sri Lanka
    'xn--80adxhks': '.мкд',  # Macedonia (alternative)
    'xn--l1acc': '.мон',  # Mongolia
    'xn--9t4b11yi5a': '.இலங்கை',  # Sri Lanka (alternative)
    'xn--rhqv96g': '.世博',  # Expo
    'xn--0zwm56d': '.澳洲',  # Australia
    'xn--czru2d': '.कोम',  # India (COM)
    'xn--d1acj3b': '.дети',  # Kids
    'xn--d1alf': '.москва',  # Moscow
    'xn--h2brj9c': '.срб',  # Serbia
    'xn--h2breg3eve': '.срб',  # Serbia (alternative)
    'xn--k1x57d': '.新加坡',  # Singapore (alternative)
    'xn--mgbbh1a71e': '.امارات',  # UAE
    'xn--mgbaam7a8h': '.الاردن',  # Jordan
    'xn--mgbayh7gpa': '.الاردن',  # Jordan (alternative)
    'xn--y9a3aq': '.հայ',  # Armenia (alternative)
    'xn--mgbx4cd0ab': '.مليسيا',  # Malaysia
    'xn--54b7fta0cc': '.بھارت',  # India
    'xn--90ae5b': '.بازار',  # Iran (Bazaar)
    'xn--l1nej': '.موقع',  # Iran (Site)
    'xn--mgbgu82a': '.شبكة',  # Iran (Network)
    'xn--fiq64b': '.कॉम',  # India (COM alternative)
    'xn--kcrx77d1x4a': '.சிங்கப்பூர்',  # Singapore (Tamil)
    'xn--i1b6b1a6a2e': '.संगठन',  # India (Organization)
    'xn--nqv7f': '.فلسطين',  # Palestine
    'xn--qqh11a': '.مصر',  # Egypt (alternative)
    'xn--c1avg': '.бел',  # Belarus (alternative)
    'xn--e1a4c': '.ею',  # European Union
    'xn--8h0a': '.ايران',  # Iran
    'xn--1qqw23a': '.游戏',  # China (Game)
    'xn--3bst00m': '.公司',  # China (Company)
    'xn--45br5cyl': '.бг',  # Bulgaria (alternative)
    'xn--s9brj9c': '.срб',  # Serbia (alternative)
    'xn--czrs0t': '.कोम',  # India (COM alternative)
    'xn--czr694b': '.कॉम',  # India (COM alternative)
    'xn--gecrj9c': '.克罗地亚',  # Croatia
    'xn--9krt00a': '.日本',  # Japan
    'xn--xkc2dl3a5ee0h': '.ಭಾರತ',  # India (Kannada)
    'xn--fzys8d69uvgm': '.تونس',  # Tunisia
    'xn--fzc2c9e2c': '.السعودية',  # Saudi Arabia (alternative)
}
def punycode_to_unicode(punycode):
    """Convert a Punycode TLD label to its Unicode representation.

    Args:
        punycode: TLD label such as ``'xn--p1ai'``. Anything that is not a
            string starting with ``'xn--'`` is returned unchanged.

    Returns:
        The result of ``idna.decode(punycode)``. If decoding fails, the
        ``IDN_MAPPINGS`` entry for the label is returned when one exists,
        otherwise the input itself.

    NOTE(review): ``IDN_MAPPINGS`` values start with a ``'.'``, so a fallback
    result is formatted differently from the label that was passed in;
    confirm which form callers expect.
    """
    # Nothing to decode for plain ASCII labels or non-string input.
    if not isinstance(punycode, str) or not punycode.startswith('xn--'):
        return punycode
    try:
        return idna.decode(punycode)
    except UnicodeError:
        # idna.IDNAError derives from UnicodeError, so this covers decoding
        # failures without also swallowing KeyboardInterrupt/SystemExit or
        # unrelated programming errors the way a bare ``except:`` would.
        return IDN_MAPPINGS.get(punycode, punycode)
def get_all_idn_tlds(
    *,
    url='https://data.iana.org/TLD/tlds-alpha-by-domain.txt',
    timeout=30,
):
    """Download the TLD list and return every IDN TLD with its Unicode form.

    Args:
        url: Location of the TLD list, read as text with one entry per line.
            Defaults to the IANA list.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        A list of dicts, one per line that starts with ``'xn--'`` after
        lower-casing, with the keys ``'punycode'``, ``'unicode'`` and
        ``'display'`` (``"<punycode> (<unicode>)"``), in file order.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    import requests

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as data.
    response.raise_for_status()

    idn_tlds = []
    for raw_line in response.text.splitlines():
        tld = raw_line.strip().lower()
        # Blank lines and '#' comment lines never start with 'xn--', so this
        # single check also filters them out.
        if not tld.startswith('xn--'):
            continue
        unicode_form = punycode_to_unicode(tld)
        idn_tlds.append({
            'punycode': tld,
            'unicode': unicode_form,
            'display': f"{tld} ({unicode_form})",
        })
    return idn_tlds
if __name__ == "__main__":
    # Script entry point: fetch the IDN TLDs, report how many were found,
    # then list the first 20 of them.
    results = get_all_idn_tlds()
    total = len(results)
    print(f"Found {total} IDN TLDs:")
    preview = results[:20]
    for entry in preview:
        print(f" {entry['display']}")