import csv as py_csv
import json as py_json
import unicodedata
from pathlib import Path
from typing import List, Iterator

import click
import mistune
from bs4 import BeautifulSoup
from bs4.element import Tag


HEADERS = [
    'Supplier',
    'Product',
    'Version',
    'Status CVE-2021-4104',
    'Status CVE-2021-44228',
    'Status CVE-2021-45046',
    'Status CVE-2021-45105',
    'Notes',
    'Links',
]


def parse_links(links: List[Tag]) -> dict:
    """Get all link info from the Links column.

    :param links: anchor tags found in a Links cell
    :return: dictionary of `{link_text: link_href}`
    """
    return {link.text: link.get('href') for link in links}
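
# For illustration (hypothetical cell, not taken from the real lists): given
#   <td><a href="https://example.com/advisory">Vendor advisory</a></td>
# parse_links would return {'Vendor advisory': 'https://example.com/advisory'}.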


def parse_record(record: List[Tag]) -> dict:
    """Parse a single tr record in the software list.

    :param record: td cells of one table row
    :return: dictionary of parsed cells from the record, keyed by header
    """
    result = {}
    # 'Links' is the last entry in HEADERS
    link_index = len(HEADERS) - 1
    for index, header in enumerate(HEADERS):
        # Parse links differently from the plain-text columns
        if index == link_index:
            if len(record) == link_index:
                # The row has no Links cell at all
                result[header] = {}
            else:
                result[header] = parse_links(record[index].find_all('a'))
        else:
            # Ensure unicode in text is properly normalized
            result[header] = unicodedata.normalize("NFKD", record[index].text)
    return result
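
# Shape of a parsed record (values are illustrative placeholders only):
#   {'Supplier': 'Example Corp', 'Product': 'Example App', 'Version': '1.0',
#    'Status CVE-2021-44228': 'Fixed', ..., 'Links': {'Advisory': 'https://...'}}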


def parse_software_file(path: Path) -> Iterator[dict]:
    """Parse a single software list file.

    :param path: path of file to parse
    :yield: a single parsed record from the file
    """
    # Render the Markdown to HTML and get a soup to query
    with path.open('r') as f:
        content = f.read()
    html = mistune.html(content)
    soup = BeautifulSoup(html, 'html.parser')
    # Walk every table row; header rows hold th cells, so filtering on td
    # keeps only the data rows
    for row in soup.find_all('tr'):
        tds = row.find_all('td')
        if tds:  # ensure empty tr are ignored
            yield parse_record(tds)
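
# The files are expected to hold Markdown tables whose columns line up with
# HEADERS, roughly like this (a sketch only, abbreviated with '...'):
#   | Supplier | Product | Version | ... | Notes | Links |
#   |----------|---------|---------|-----|-------|-------|
#   | ...      | ...     | ...     | ... | ...   | ...   |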


@click.group()
@click.option('--path', default='../../software/', help='Path to software list',
              type=click.Path(exists=True, path_type=Path))
@click.pass_context
def main(ctx, path):
    """Parse the software list files and share the records with subcommands."""
    records = []
    # Gather the software list files in a stable, sorted order
    software_lists = sorted(f for f in path.iterdir() if 'software_list_' in f.name)
    for software_file in software_lists:
        records.extend(parse_software_file(software_file))
    ctx.obj['records'] = records


@main.command()
@click.argument('output', default='-', type=click.File('w+'))
@click.pass_context
def json(ctx, output):
    """Dump the parsed records to OUTPUT as JSON ('-' writes to stdout)."""
    py_json.dump(ctx.obj['records'], output)


@main.command()
@click.argument('output', default='-', type=click.File('w+'))
@click.pass_context
def csv(ctx, output):
    """Dump the parsed records to OUTPUT as CSV ('-' writes to stdout)."""
    writer = py_csv.DictWriter(output, HEADERS)
    writer.writeheader()
    for record in ctx.obj['records']:
        # Flatten the links dictionary to a list of hrefs for the CSV cell
        record['Links'] = list(record['Links'].values())
        writer.writerow(record)


if __name__ == '__main__':
    main(obj={})
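
# Example invocations (the --path value is illustrative):
#   python softwarelist_parser.py --path ../../software/ json records.json
#   python softwarelist_parser.py csv  # OUTPUT defaults to '-', i.e. stdout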