import dataclasses
import os
import re
import tempfile
import time
import typing
from collections import defaultdict
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from pathlib import Path
from pprint import pformat

import requests
from bs4 import BeautifulSoup


def get_wiki_page(url: str) -> Path:
"""Download a page.
Stores the downloaded page in the OS temp dir, for up to 5 days, to allow
for faster iteration when improving the script.
"""
tmp_dir = Path(tempfile.gettempdir())
outfile = tmp_dir / Path(os.path.basename(url))
now = time.time()
max_age = timedelta(days=5) / timedelta(seconds=1)
if outfile.exists() and outfile.stat().st_mtime > now - max_age:
print(f"{outfile} exists and is recent")
return outfile
print(f"downloading {url} to {outfile}")
with open(outfile, "bw+") as fout:
page_resp = requests.get(url)
fout.write(page_resp.content)
return outfile
OPERATOR_NAME_CLEANER = re.compile(r"\s+(\[.+\])?$")
COUNTRY_HEADER = re.compile(r"[A-Z]{2}-[A-Z]{2}|[A-Z]{2}")
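# As a rough illustration (hypothetical cell text, not taken from the live
# pages): OPERATOR_NAME_CLEANER.sub("", "Telstra [7]") leaves "Telstra", and
# COUNTRY_HEADER.findall("Denmark - DK") picks out ["DK"]; headings covering
# several countries yield several codes.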

# Some countries share an MCC, and since the dataset groups by MCC, it can be
# difficult to figure out the correct country for those MCCs.
# This provides a small override table to correct for those cases.
OVERRIDES = {
    # Guadeloupe is grouped with other French overseas territories
    ("340", "01"): "GP",
    ("340", "02"): "GP",
    ("340", "03"): "GP",
    ("340", "08"): "GP",
    ("340", "09"): "GP",
    ("340", "20"): "GP",
    # The former Netherlands Antilles are still grouped in one table
    ("362", "31"): "BQ",
    ("362", "33"): "CW",
    ("362", "51"): "SX",
    ("362", "54"): "CW",
    ("362", "59"): "BQ",
    ("362", "60"): "BQ",
    ("362", "63"): "CW",
    ("362", "68"): "CW",
    ("362", "69"): "CW",
    ("362", "74"): "CW",
    ("362", "76"): "BQ",
    ("362", "78"): "BQ",
    ("362", "91"): "CW",
    ("362", "94"): "CW",
    # Israeli MCC+MNCs used by Palestinian operators
    ("425", "05"): "PS",
    ("425", "06"): "PS",
    # Australia is grouped with the Christmas and Cocos Islands
    ("505", "01"): "AU",
    ("505", "02"): "AU",
    ("505", "03"): "AU",
    ("505", "04"): "AU",
    ("505", "07"): "AU",
    ("505", "10"): "AU",
    ("505", "11"): "AU",
    ("505", "13"): "AU",
    ("505", "14"): "AU",
    ("505", "16"): "AU",
    ("505", "17"): "AU",
    ("505", "18"): "AU",
    ("505", "19"): "AU",
    ("505", "20"): "AU",
    ("505", "21"): "AU",
    ("505", "22"): "AU",
    ("505", "23"): "AU",
    ("505", "24"): "AU",
    ("505", "25"): "AU",
    ("505", "26"): "AU",
    ("505", "27"): "AU",
    ("505", "28"): "AU",
    ("505", "30"): "AU",
    ("505", "31"): "AU",
    ("505", "32"): "AU",
    ("505", "33"): "AU",
    ("505", "34"): "AU",
    ("505", "35"): "AU",
    ("505", "36"): "AU",
    ("505", "37"): "AU",
    ("505", "38"): "AU",
    ("505", "39"): "AU",
    ("505", "40"): "AU",
    ("505", "41"): "AU",
    ("505", "42"): "AU",
    ("505", "43"): "AU",
    ("505", "44"): "AU",
    ("505", "45"): "AU",
    ("505", "46"): "AU",
    ("505", "47"): "AU",
    ("505", "48"): "AU",
    ("505", "49"): "AU",
    ("505", "50"): "AU",
    ("505", "51"): "AU",
    ("505", "52"): "AU",
    ("505", "53"): "AU",
    ("505", "54"): "AU",
    ("505", "61"): "AU",
    ("505", "62"): "AU",
    ("505", "68"): "AU",
    ("505", "71"): "AU",
    ("505", "72"): "AU",
    ("505", "88"): "AU",
    ("505", "90"): "AU",
    # Reunion and Mayotte are also grouped
    ("647", "00"): "RE",
    ("647", "01"): "YT",
    ("647", "02"): "YT",
    ("647", "03"): "RE",
    ("647", "04"): "RE",
    ("647", "10"): "RE",
}

# Some countries have MCCs without any MNCs allocated. Since we base the data
# on the list of MNCs, we need a way to include additional MCCs.
EXTRA_COUNTRY_MCC = {
    "406": "IN",
    "316": "US",
    "332": "VI",
    "430": "AE",
    "431": "AE",
    "461": "CN",
}


@dataclasses.dataclass
class OperatorEntry:
    """Container for scraped information about network operators.

    The basic building block from which we build our datasets.
    """

    mcc: str
    mnc: str
    country_code: str
    name: str
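
    # A populated instance might look like (hypothetical values):
    #   OperatorEntry(mcc="238", mnc="01", country_code="DK", name="TDC")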


def extract_page(path: Path) -> typing.Iterator[OperatorEntry]:
    """Yield MCC, MNC, country code and operator name entries from a page.

    Scrapes the Wikipedia page, which mostly follows a rigid structure.
    """
    with open(path, encoding="utf8") as fin:
        soup = BeautifulSoup(fin.read(), features="lxml")
    for country_header in soup.find_all("h4"):
        possible_countries = COUNTRY_HEADER.findall(country_header.text.strip())
        table_rows = country_header.find_next_sibling("table").find_all("tr")
        for row in table_rows:
            columns = row.find_all("td")
            if not columns:
                continue
            # Skip networks that are no longer in operation.
            if columns[4].text.strip() == "Not operational":
                continue
            mcc = columns[0].text.strip()
            # Keep only the trailing three digits when the cell carries extra
            # characters.
            if len(mcc) != 3:
                mcc = mcc[-3:]
            mnc = columns[1].text.strip()
            # Some MNC entries on the wiki pages define ranges with a hyphen.
            # For now we just skip them, but perhaps in time we should expand
            # them to the full ranges (a sketch of that follows this function).
            if "-" in mnc:
                continue
            # Some entries on the wiki pages have unknown MNCs, which is
            # _problematic_ to say the least.
            if mnc == "?":
                continue
            # Due to the structure of the wiki dataset, we have no direct
            # mapping between MCC+MNC and country codes, only potential country
            # codes. If we see more than one potential country code and have no
            # override rule for the entry, raise an error.
            if len(possible_countries) != 1 and (mcc, mnc) not in OVERRIDES:
                raise ValueError(
                    f"missing override for: {mcc}:{mnc}, options: {possible_countries}"
                )
            if (mcc, mnc) in OVERRIDES:
                country_code = OVERRIDES[(mcc, mnc)]
            else:
                country_code = possible_countries[0]
            # The Wikipedia data contains some entries with subregion country
            # codes; handling these might be reasonable, but including them
            # would be a breaking change.
            if len(country_code) != 2:
                continue
            operator_name = OPERATOR_NAME_CLEANER.sub("", columns[3].text)
            yield OperatorEntry(
                mcc=mcc,
                mnc=mnc,
                country_code=country_code,
                name=operator_name,
            )
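

# A minimal sketch, not used by the script: one way the hyphenated MNC ranges
# skipped in extract_page could be expanded into individual codes (the helper
# name and behaviour are assumptions, not part of the original scraper).
def expand_mnc_range(mnc: str) -> typing.List[str]:
    start, end = mnc.split("-")
    # Preserve zero-padding, so "05-08" expands to ["05", "06", "07", "08"].
    width = len(start)
    return [f"{i:0{width}d}" for i in range(int(start), int(end) + 1)]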


def main() -> None:
    """Extract information from a bunch of wiki pages to build the datasets from.

    While the wiki is imperfect, the mappings between the datasets are not
    perfect either: some MCCs belong to government entities that are disputed,
    and some MCCs are shared by multiple countries.
    This makes a simple general mapping impossible, so the code has to handle
    a bunch of exceptions and do a best-effort mapping.
    """
    # pylint: disable=too-many-locals
    files = [
        "https://en.wikipedia.org/wiki/Mobile_network_codes_in_ITU_region_2xx_(Europe)",
        "https://en.wikipedia.org/wiki/Mobile_Network_Codes_in_ITU_region_3xx_(North_America)",
        "https://en.wikipedia.org/wiki/Mobile_network_codes_in_ITU_region_4xx_(Asia)",
        "https://en.wikipedia.org/wiki/Mobile_Network_Codes_in_ITU_region_5xx_(Oceania)",
        "https://en.wikipedia.org/wiki/Mobile_Network_Codes_in_ITU_region_6xx_(Africa)",
        "https://en.wikipedia.org/wiki/Mobile_Network_Codes_in_ITU_region_7xx_(South_America)",
    ]
    paths = [get_wiki_page(filename) for filename in files]
    now = datetime.now(timezone.utc)
    extracted_pages = (values for path in paths for values in extract_page(path))
    operator_by_plmn = defaultdict(dict)
    for operator in extracted_pages:
        operator_by_plmn[int(operator.mcc)][int(operator.mnc)] = (
            operator.country_code,
            operator.name,
        )
    operator_by_plmn = dict(operator_by_plmn)
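    # operator_by_plmn now maps MCC -> MNC -> (country code, operator name),
    # e.g. a hypothetical excerpt: {238: {1: ("DK", "TDC"), 2: ("DK", "Telenor")}}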
    plmn_by_countries = defaultdict(list)
    for mcc, mncs in operator_by_plmn.items():
        for mnc, country in mncs.items():
            plmn_by_countries[country[0]].append((mcc, mnc))
    plmn_by_countries = dict(plmn_by_countries)
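    # plmn_by_countries inverts that: country code -> [(MCC, MNC), ...]; the
    # hypothetical excerpt above would become {"DK": [(238, 1), (238, 2)]}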
    generated_file_header = "\n".join(
        [
            "# Generated by get_e212_wiki.py",
            f"# at {now.isoformat()}",
            "#",
            "# Based on the following pages from wikipedia:",
            *[f"# {page}" for page in files],
            "#",
            "# Do not make any manual edits to this file.",
            "# Edits will be overwritten when the generator script runs.",
            "import typing",
            "",
        ]
    )
    with open("phone_iso3166/e212_names.py", "w", encoding="utf8") as fout:
        fout.write("# pylint: disable=too-many-lines\n")
        fout.write(generated_file_header)
        fout.write(
            f"\noperators: typing.Dict[int, typing.Dict[int, typing.Tuple[str, str]]] = {pformat(operator_by_plmn)}\n"
        )
        fout.write(
            f"\n\ncountries: typing.Dict[str, typing.List[typing.Tuple[int, int]]] = {pformat(plmn_by_countries)}\n"
        )
        fout.write("\n")
    def reduce_operators(
        operators: typing.Dict[int, typing.Tuple[str, str]],
    ) -> typing.Union[str, typing.Dict[int, str]]:
        countries = {each[0] for each in operators.values()}
        if len(countries) == 1:
            return countries.pop()
        return {mnc: operator[0] for mnc, operator in operators.items()}

    plmns_to_country = {}
    for mcc, operator in operator_by_plmn.items():
        plmns_to_country[mcc] = reduce_operators(operator)
    for mcc, country_code in EXTRA_COUNTRY_MCC.items():
        plmns_to_country[int(mcc)] = country_code
    with open("phone_iso3166/e212.py", "w", encoding="utf8") as fout:
        fout.write(generated_file_header)
        fout.write(
            f"\nnetworks: typing.Dict[int, typing.Union[str, typing.Dict[int, str]]] = {pformat(plmns_to_country)}\n"
        )


if __name__ == "__main__":
    main()
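
# Run from the repository root so the relative phone_iso3166/ output paths
# resolve, e.g.:
#
#   python get_e212_wiki.py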