# Used to standardize the names of countries into their three-letter ISO 3166 codes
# Notes:
# * Some older country names are used for compatibility with older data (eg: East Timor vs Timor-Leste)
# * Exact match doesn't need regex flags as it uses pl.when(pl.col(match_column) == f"(?i){key}")
# * Fuzzy match uses pl.when(pl.col(match_column).str.contains(f"(?i){key}")) 

# Whole-string lookups: name -> ISO 3166 alpha-3 code.
# Most entries live here rather than in the substring table because the bare
# name is contained in some other country's name (see the per-entry notes).
# NOTE(review): a value of None presumably means "do not assign a country" --
# confirm against the code that consumes this table.
exact_match = {
	"Europe": None,
	"Guinea": "GIN",                   # would otherwise collide with PNG, GNB, and GNQ
	"hospital": None,
	"Ireland": "IRL",                  # would otherwise collide with Northern Ireland
	"Korea": "KOR",                    # would otherwise collide with North Korea
	"Mali": "MLI",                     # would otherwise collide with Somalia
	"Mexico": "MEX",                   # would otherwise collide with New Mexico
	"Niger": "NER",                    # would otherwise collide with Nigeria
	"patient": None,
	"Republic of Congo": "COG",        # would otherwise collide with COD
	"Republic of the Congo": "COG",    # would otherwise collide with COD
	"Samoa": "WSM",                    # would otherwise collide with ASM
	"Sudan": "SDN",                    # would otherwise collide with South Sudan
	"The Congo": None,                 # ambiguous (COG or COD)
	"uncalculated": None,
	"United States": "USA",            # would otherwise collide with VIR/UMI
	"USA": "USA",                      # kept exact in case a substring search would produce a false hit
	"USA: Texas": "USA",               # TODO: handle this as a region so as not to mess with Texas

	# frequently seen abbreviations
	"DPRK": "PRK",
	"GB": "GBR",
	"IRE": "IRL",
	"PRC": "CHN",
	"DRC": "COD",
	"UK": "GBR",
	"US": "USA",

	# frequently seen misspellings
	"Argentia": "ARG",
	"Ethopia": "ETH",
	"Marocco": "MAR",
}

# Maps three-letter country codes to a continent name.
# The table is not exhaustive: codes that do not appear here have no continent entry.
# Lithuania, Palestine and Slovakia are listed under their ISO 3166-1 alpha-3 codes
# (LTU, PSE, SVK) and also under the non-ISO spellings this table historically used
# (LTY, PAL, SLK), so lookups with either spelling resolve to the same continent.
countries_to_continents = {
	# not exhaustive
	'AFG': 'Asia',
	'ALB': 'Europe',
	'ARG': 'South America',
	'ARM': 'Asia',
	'AUS': 'Oceania',
	'AUT': 'Europe',
	'AZE': 'Asia', # per NCBI standard
	'BEL': 'Europe',
	'BEN': 'Africa',
	'BFA': 'Africa',
	'BGD': 'Asia',
	'BGR': 'Europe',
	'BIH': 'Europe',
	'BLR': 'Europe',
	'BRA': 'South America',
	'BWA': 'Africa',
	'CAN': 'North America',
	'CHE': 'Europe',
	'CHN': 'Asia',
	'CIV': 'Africa',
	'CMR': 'Africa',
	'COD': 'Africa',
	'COG': 'Africa',
	'COL': 'South America',
	'COM': 'Africa',
	'CZE': 'Europe',
	'DEU': 'Europe',
	'DJI': 'Africa',
	'DNK': 'Europe',
	'DOM': 'North America',
	'DZA': 'Africa',
	'ECU': 'South America',
	'ERI': 'Africa',
	'ESP': 'Europe',
	'EST': 'Europe',
	'ETH': 'Africa',
	'FIN': 'Europe',
	'FRA': 'Europe',
	'GAB': 'Africa',
	'GBR': 'Europe',
	'GEO': 'Asia', # per NCBI standard
	'GHA': 'Africa',
	'GIN': 'Africa',
	'GMB': 'Africa',
	'GNB': 'Africa',
	'GNQ': 'Africa',
	'GRC': 'Europe',
	'GRL': 'North America',
	'GTM': 'North America',
	'HKG': 'Asia',
	'HND': 'North America',
	'HRV': 'Europe',
	'HUN': 'Europe',
	'IDN': 'Asia',
	'IND': 'Asia',
	'IRL': 'Europe',
	'IRN': 'Asia',
	'ISR': 'Asia',
	'ITA': 'Europe',
	'JPN': 'Asia',
	'KAZ': 'Asia',
	'KEN': 'Africa',
	'KGZ': 'Asia',
	'KHM': 'Asia',
	'KIR': 'Oceania',
	'KOR': 'Asia',
	'LBN': 'Asia',
	'LBR': 'Africa',
	'LKA': 'Asia',
	'LTU': 'Europe', # Lithuania (ISO 3166-1 alpha-3)
	'LTY': 'Europe', # legacy non-ISO spelling of LTU, kept for backward compatibility
	'LVA': 'Europe',
	'MAR': 'Africa',
	'MDA': 'Europe',
	'MDG': 'Africa',
	'MEX': 'North America',
	'MKD': 'Europe',
	'MLI': 'Africa',
	'MMR': 'Asia',
	'MNE': 'Europe',
	'MNG': 'Asia',
	'MOZ': 'Africa',
	'MRT': 'Africa',
	'MWI': 'Africa',
	'MYS': 'Asia',
	'NAM': 'Africa',
	'NER': 'Africa',
	'NGA': 'Africa',
	'NLD': 'Europe',
	'NOR': 'Europe',
	'NPL': 'Asia',
	'NZL': 'Oceania',
	'OMN': 'Asia',
	'PAK': 'Asia',
	'PAN': 'North America', # per NCBI standard
	'PER': 'South America',
	'PHL': 'Asia',
	'PAL': 'Asia', # legacy non-ISO spelling of PSE, kept for backward compatibility
	'PNG': 'Oceania',
	'POL': 'Europe',
	'PRK': 'Asia',
	'PRT': 'Europe',
	'PRY': 'South America',
	'PSE': 'Asia', # Palestine (ISO 3166-1 alpha-3)
	'ROU': 'Europe',
	'RUS': 'Europe', # per NCBI standard
	'RWA': 'Africa',
	'SAU': 'Asia',
	'SDN': 'Africa',
	'SEN': 'Africa',
	'SGP': 'Asia',
	'SLE': 'Africa',
	'SLK': 'Europe', # legacy non-ISO spelling of SVK, kept for backward compatibility
	'SOM': 'Africa',
	'SRB': 'Europe',
	'SVK': 'Europe', # Slovakia (ISO 3166-1 alpha-3)
	'SVN': 'Europe',
	'SWE': 'Europe',
	'SWZ': 'Africa',
	'SYR': 'Asia',
	'THA': 'Asia',
	'TJK': 'Asia',
	'TKM': 'Asia',
	'TLS': 'Asia',
	'TUN': 'Africa',
	'TWN': 'Asia',
	'TZA': 'Africa',
	'UGA': 'Africa',
	'UKR': 'Europe',
	'USA': 'North America',
	'UZB': 'Asia',
	'VEN': 'South America',
	'VNM': 'Asia',
	'XKX': 'Europe',
	'ZAF': 'Africa',
	'ZMB': 'Africa',
	'ZWE': 'Africa',
}

# The apostrophes have won. I'm just going to substring match on Ivory and Ivoire
#exact_match_ofarrells_wrath = {
#	#
#	#      apostrophes
#	#   CIV    🤝     IRL
#	#
#	"Cote D Ivoire": 'CIV',
#	"Cote d''Ivoire": 'CIV',
#	"Cote d'Ivoire": 'CIV',  # inconsistent matching, TODO check if polars bug or skill issue on my part
#	"Cote d\'Ivoire": 'CIV', # this doesn't seem to help
#	"Côte d'Ivoire": 'CIV',  # with o circumflex (U+00F4; not an ASCII character)
#	"Republic of Côte d'Ivoire": 'CIV',
#	"Cote d_Ivoire": 'CIV',
#	"Ivory Coast": 'CIV',
#	"IVORY_COAST": 'CIV'
#}

# Substring lookups: country name fragment -> ISO 3166 alpha-3 code (None for
# strings that are known not to be a country).
# Per the notes at the top of this file, each key is used as a case-insensitive
# pattern with str.contains(), so a key also matches longer strings containing it.
# NOTE(review): insertion order appears to be significant -- several short keys are
# contained in later, more specific ones ('Oman' in 'Romania', 'Ireland' in
# 'Northern Ireland'), and 'South Africa' is deliberately placed last. Do not
# re-sort this table without confirming how the consumer applies it.
substring_match = {
	'Afghanistan': 'AFG',
	'Albania': 'ALB',
	'Algeria': 'DZA',
	'Angola': 'AGO',
	'Argentina': 'ARG',
	'Armenia': 'ARM',
	'Aruba': 'ABW',
	'Australia': 'AUS',
	'Austria': 'AUT',
	'Azerbaijan': 'AZE',
	'Bangladesh': 'BGD',
	'Belarus': 'BLR',
	'Belgium': 'BEL',
	'Benin': 'BEN',
	'Bhutan': 'BTN',
	'Blood': None,  # SAMN02585006
	'Bosnia and Herzegovina': 'BIH',
	'Botswana': 'BWA',
	'Brazil': 'BRA',
	'Britain': 'GBR', # substring matches Great Britain, Kingdom of Great Britain, etc
	'British Virgin Islands': 'VGB',
	'Bulgaria': 'BGR',
	'Burkina Faso': 'BFA',
	'Burma': 'MMR',
	'Burundi': 'BDI',
	'Cambodia': 'KHM',
	'Cameroon': 'CMR',
	'Canada': 'CAN',
	'Cape Verde': 'CPV',
	'Central African Republic': 'CAF',
	'Chile': 'CHL',
	'China': 'CHN',
	'Colombia': 'COL',
	'Comoros': 'COM',
	'Costa Rica': 'CRI',
	'Croatia': 'HRV',
	'Czech Republic': 'CZE',
	'Czechia': 'CZE',
	'Democratic Republic of the Congo': 'COD',
	'Denmark': 'DNK',
	'Djibouti': 'DJI',
	'Dominican Republic': 'DOM',
	'East Timor': 'TLS',
	'Ecuador': 'ECU',
	'Egypt': 'EGY',
	'El Salvador': 'SLV',
	'Equatorial Guinea': 'GNQ',
	'Eritrea': 'ERI',
	'Estonia': 'EST',
	'Eswatini': 'SWZ',
	'Ethiopia': 'ETH',
	'Finland': 'FIN',
	'France': 'FRA',
	'Gabon': 'GAB',
	'Gambia': 'GMB',
	'Georgia': 'GEO',
	'Germany': 'DEU',
	'Ghana': 'GHA',
	'Gibraltar': 'GIB',
	'Greece': 'GRC',
	'Greenland': 'GRL',
	'Guadeloupe': 'GLP',
	'Guam': 'GUM',
	'Guatemala': 'GTM',
	'Guinea-Bissau': 'GNB',
	'Haiti': 'HTI',
	'Honduras': 'HND',
	'Hong Kong': 'HKG',
	'Hungary': 'HUN',
	'India': 'IND',
	'Indonesia': 'IDN',
	'Iran': 'IRN',
	'Iraq': 'IRQ',
	'Ireland': 'IRL',
	'Israel': 'ISR',
	'Italy': 'ITA',
	'Ivoire': 'CIV', # workaround for O'Farrell's wrath
	'Ivory': 'CIV', # workaround for O'Farrell's wrath
	'Jamaica': 'JAM',
	'Japan': 'JPN',
	'Kazakhstan': 'KAZ',
	'Kenya': 'KEN',
	'Kiribati': 'KIR',
	'Kosovo': 'XKX', # unofficial but widely used
	'Kuwait': 'KWT',
	'Kyrgyzstan': 'KGZ',
	'Laos': 'LAO',
	'Latvia': 'LVA',
	'Lebanon': 'LBN',
	'Liberia': 'LBR',
	'Libya': 'LBY',
	'Lithuania': 'LTU', # ISO 3166-1 alpha-3 (previously the non-ISO 'LTY')
	'Macedonia': 'MKD', # substring matches North Macedonia, former Yugoslav Republic of Macedonia, etc
	'Madagascar': 'MDG',
	'Malawi': 'MWI',
	'Malaysia': 'MYS',
	'Malta': 'MLT',
	'Marshall Islands': 'MHL',
	'Martinique': 'MTQ',
	'Mauritania': 'MRT',
	'Mayotte': 'MYT',
	'Moldova': 'MDA',
	'Mongolia': 'MNG',
	'Montenegro': 'MNE',
	'Morocco': 'MAR',
	'Mozambique': 'MOZ',
	'Myanmar': 'MMR',
	'Namibia': 'NAM',
	'Nepal': 'NPL',
	'Netherlands': 'NLD', # substring matches The Netherlands
	'New Caledonia': 'NCL',
	'New Zealand': 'NZL',
	'Nigeria': 'NGA',
	'North Korea': 'PRK',
	'North Macedonia': 'MKD',
	'Northern Ireland': 'GBR',
	'Northern Mariana Islands': 'MNP',
	'Norway': 'NOR',
	'Oman': 'OMN',
	'Pakistan': 'PAK',
	'Palau': 'PLW',
	'Palestine': 'PSE', # ISO 3166-1 alpha-3 (previously the non-ISO 'PAL')
	'Panama': 'PAN',
	'Papua New Guinea': 'PNG',
	'Paraguay': 'PRY',
	'Peru': 'PER',
	'Philippines': 'PHL', # substring matches The Philippines
	'Poland': 'POL',
	'Portugal': 'PRT',
	'Romania': 'ROU',
	'Russia': 'RUS', # substring matches Russian Federation
	'Rwanda': 'RWA',
	'Saudi Arabia': 'SAU',
	'Senegal': 'SEN',
	'Serbia': 'SRB',
	'Sierra Leone': 'SLE',
	'SierraLeone': 'SLE',
	'Singapore': 'SGP',
	'Slovakia': 'SVK', # ISO 3166-1 alpha-3 (previously the non-ISO 'SLK')
	'Slovenia': 'SVN',
	'Somalia': 'SOM',
	'South Korea': 'KOR',
	'South Sudan': 'SSD', # Sudan is an exact match
	'Spain': 'ESP',
	'Sri Lanka': 'LKA',
	'Suriname': 'SUR',
	'Swaziland': 'SWZ',
	'Sweden': 'SWE',
	'Switzerland': 'CHE',
	'Syria': 'SYR', # substring matches Syrian Arab Republic
	'Taiwan': 'TWN',
	'Tajikistan': 'TJK',
	'Tanzania': 'TZA',
	'Thailand': 'THA',
	'The former Yugoslav Republic of Macedonia': 'MKD', # allows pulling region from SRR9614686 and the like
	'The Gambia': 'GMB',
	'Timor Leste': 'TLS',
	'Timor-Leste': 'TLS',
	'Togo': 'TGO',
	'Tunisia': 'TUN',
	'Turkey': 'TUR',
	'Turkiye': 'TUR',
	'Turkmenistan': 'TKM',
	'Türkiye': 'TUR',
	'Uganda': 'UGA',
	'Ukraine': 'UKR',
	'United Kingdom': 'GBR',
	'United States Minor Outlying Islands': 'UMI',
	'United States of America': 'USA',
	'United States Virgin Islands': 'VIR',
	'Uruguay': 'URY',
	'Uzbekistan': 'UZB',
	'Venezuela': 'VEN',
	'Viet Nam': 'VNM',
	'Vietnam': 'VNM',
	'Western Samoa': 'WSM',
	'Yemen': 'YEM',
	'Zambia': 'ZMB',
	'Zimbabwe': 'ZWE',

	# done last to avoid improper matches to the general continent region of southern africa
	'South Africa': 'ZAF'
}