# Parse the saved HTML pages and export one CSV file per project.
def jiexi_html_files():
    """Parse the saved review pages and write one CSV file per project.

    The index page at ``files_dir_path`` is read first to build a mapping of
    project name -> review URL.  Every file in the current directory whose
    name starts with ``pqrs_project`` is then parsed; its overview data plus
    one row per question is written to ``output_path + <symbol> + '.csv'``.

    Relies on names defined outside this function: ``files_dir_path``,
    ``output_path``, ``transform_month``, ``etree`` and ``os``.

    Raises:
        IndexError: if an element the code indexes into is missing from a
            page, or a file name has fewer than four '_'-separated parts.
    """

    def compact(text):
        """Remove spaces and line breaks from a label read from the page."""
        return text.replace(' ', '').replace('\n', '').replace('\r', '')

    # Build the project name -> review URL lookup from the index page.
    # The encoding is explicit so decoding does not depend on the platform
    # default (the CSV output below is written as UTF-8 as well).
    with open(files_dir_path, 'r', encoding='utf-8') as f:
        index_html = etree.HTML(f.read())
    project_url_dict = {}
    for link in index_html.xpath("//a[@class='ReviewCell_review_head_project_name__iy66Q']"):
        http_url = link.xpath("@href")[0]
        project_url_dict[link.xpath('text()')[0]] = http_url

    title = 'clumn1,clumn2,clumn3,clumn4,clumn5,clumn6,clumn7,clumn8,clumn9,clumn10,clumn11,clumn12,clumn13\n'
    # Category columns of the CSV, in output order.
    category_names = ('Code And Team', 'Documentation', 'Testing', 'Security', 'Access Controls')

    files_dir = './'
    for file in os.listdir(files_dir):
        if not file.startswith('pqrs_project'):
            continue

        # Read the page and release the file handle before extracting data.
        with open(files_dir + file, 'r', encoding='utf-8') as f:
            print('当前处理文件:', file)
            html = etree.HTML(f.read())

        # Project website; stays '无' unless exactly one website tile exists.
        web_url = '无'
        website = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH OverviewSection_protocol_website__1kGWZ']")
        if len(website) == 1:
            web_url = website[0].xpath("./a/text()")[0]

        # Per-category scores, keyed by the compacted category label.
        class_dict = {}
        for section in html.xpath("//div[@class='AuditSummary_section_head__34n4V']"):
            class_name = compact(section.xpath('./button/text()')[0])
            class_dict[class_name] = '{}%'.format(section.xpath('./span/text()')[0])
        # The keys above had their spaces removed, so the lookup names must be
        # compacted the same way; a spaced name such as 'Code And Team' could
        # never match and would always yield None.
        category_scores = [class_dict.get(compact(name)) for name in category_names]

        # The project symbol is the fourth '_'-separated part of the file
        # name, without its extension.
        project_symbol = file.split('_')[3].split('.')[0]
        project_pqr_url = project_url_dict.get(project_symbol, '无')

        # Overall project score.
        score_node = html.xpath("//span[@class='OverviewSection_final_score_value__1QwX4']")[0]
        project_pqr_score = '{}%'.format(score_node.xpath('text()')[0])

        # Publication date and version come from the overview text tiles.
        tiles = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH']")
        project_date = transform_month(tiles[0].xpath('./span/text()')[1])
        project_version = tiles[1].xpath('./span/text()')[2]

        # Fields that are identical for every question row of this project.
        leading_fields = [project_symbol, web_url, project_pqr_url,
                          project_pqr_score, project_version] + category_scores

        # One CSV row per question and its score.
        rows = []
        for row in html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]"):
            # Commas in the question would split the hand-built CSV row, so
            # they are replaced with '.'.
            pro_question = compact(row.xpath('./button/text()')[0]).replace(',', '.')
            pro_question_value = row.xpath('./span')[1].xpath('text()')[0]
            fields = leading_fields + [project_date, pro_question, pro_question_value]
            rows.append(','.join(str(value) for value in fields) + '\n')

        with open(output_path + project_symbol + '.csv', "w", encoding="utf-8") as out:
            out.write(title)
            out.write(''.join(rows))
1)查找class属性包含 'AuditSummary_table_row__2Prwu' 的所有div标签
flist = html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]")
2)python字典更新,字典初始化: val = {project: http_url}
project_url_dict.update(val) # 向字典里更新数据
3)python字典获取: project_pqr_url = project_url_dict.get(project_symbol)
if project_pqr_url is None:
project_pqr_url = '无'
4)xpath解析要点:
详见网址: