依旧是老师的作业,开源精神指引着我……
以下是Python2.7代码,2018.5.22运行通过
# coding=utf-8
author__ = 'Read Air'
import cookielib
import urllib2
import re
import csv
def saveHtml(file_name, file_content):
    """Save raw page content to ``<file_name>.html`` in the current directory.

    Every character that Windows does not allow in a file name is replaced
    with ``_`` before the file is created, so names derived from URLs or
    issue titles cannot fail on open or be interpreted as a path.

    Args:
        file_name: Base name of the output file, without the extension.
        file_content: Bytes to write; the file is opened in binary mode, so
            the caller must pass already-encoded data.
    """
    # Windows reserves \ / : * ? " < > | in file names; replacing only '/'
    # still lets the other characters break open() on that platform.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', file_name)
    with open(safe_name + ".html", "wb") as f:
        f.write(file_content)
# Script body: download the JIRA split-view listing of unresolved CAMEL
# issues, save every fetched page as an .html file, and append one CSV row
# (issue key, first description paragraph) for each issue typed "Bug".

# Opener that keeps cookies between requests and sends a browser-like
# User-Agent header.
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
# Adjacent literals are used instead of a backslash continuation inside the
# string, so indentation of the second line can never leak into the header.
opener.addheaders.append((
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; '
    'rv:11.0) like Gecko Core/1.63.5221.400 QQBrowser/10.0.1125.400'))

# Fetch the issue listing page and keep a local copy as "h.html".
result_url = opener.open(
    urllib2.Request(
        'https://issues.apache.org/jira/browse/CAMEL-12525?jql=project%20%3D%20CAMEL%20AND%20resolution%20%3D%20Unresolved%20ORDER%20BY%20priority%20DESC%2C%20updated%20DESC'))
content = result_url.read()
result_url.close()
print("Success in Web1!")
saveHtml("h", content)

# re_law1 captures the split-view container of the listing page.
re_law1 = r'<div class="aui-group split-view">(.*?)</div></div></div></div>'
# re_law2 captures, per issue link: (issue key, href, icon alt, icon title).
re_law2 = r'<a class="splitview-issue-link" data-issue-key="(.*?)" href="(.*?)"><img height="16" width="16" alt="(.*?)" title="(.*?)"'
# re_law3 captures the first <p> that follows the user-content block of an
# issue page; it is compiled once because it is reused for every bug.
re_law3 = r'<div class="user-content-block">[\s\S]+? <p>(.*?)</p>'
description_pattern = re.compile(re_law3, re.S)

key_data = re.findall(re_law1, content, re.S)
# If the container is missing, scan an empty string instead of raising
# IndexError on key_data[0]; the CSV then receives only the header row.
issue_list_html = key_data[0] if key_data else ""

index = 0
# One handle and one writer for the whole run: re-opening the file for every
# bug left earlier handles unclosed and the row order up to finalization.
# NOTE: append mode means every run adds another header row to the file.
with open("BUG List.csv", "a+") as out:
    csv_writer = csv.writer(out, dialect="excel")
    csv_writer.writerow(["Number", "Description"])
    for i in re.findall(re_law2, issue_list_html, re.S):
        # i[2] is the alt text of the issue-type icon; only bugs are recorded.
        if i[2] != "Bug":
            continue
        index += 1
        print("find " + str(index) + " Bug(s)! named " + i[0])

        # i[1] is a site-relative href to the issue's own page.
        result_url = opener.open(urllib2.Request('https://issues.apache.org' + i[1]))
        content1 = result_url.read()
        result_url.close()
        saveHtml(i[0], content1)

        # Run the pattern once and reuse the result.
        found = description_pattern.findall(content1)
        description = found[0] if found else "Not Found!"
        csv_writer.writerow([i[0], description])