import urllib.request
import requests
from bs4 import BeautifulSoup
#from urllib.parse import quote
"""
爬取静态网页 https://www.ccf.org.cn/Academic_Evaluation/AI/
比较request 和 requests不同使用
以及中文字符乱码的处理
"""
def fetch_data_byrequest(url):
    """Fetch *url* with stdlib urllib and return the decoded body text.

    Args:
        url: absolute URL of the page to download.

    Returns:
        str: the response body decoded as UTF-8 (``decode()`` default).

    Raises:
        urllib.error.URLError: on network/HTTP failure.
    """
    # Context manager guarantees the HTTP response is closed even if
    # read()/decode() raises (the original leaked the connection).
    # NOTE(review): decode() assumes UTF-8; a page served in GBK would
    # need the charset from the response headers — TODO confirm.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode()
def fetch_data_byrequests(url):
    """Fetch *url* with the ``requests`` package and return the body text.

    Handles mojibake for pages whose HTTP headers omit or misstate the
    charset: ``req.text`` decodes with ``req.encoding`` (which falls back
    to ISO-8859-1 and may even be ``None``), so instead we decode the raw
    bytes with the charset detected from the content itself.

    Args:
        url: absolute URL of the page to download.

    Returns:
        str: the response body decoded with ``req.apparent_encoding``.
    """
    req = requests.get(url)
    # Decode the raw payload directly. The original
    # text.encode(req.encoding).decode(apparent_encoding) round-trip
    # raises TypeError when req.encoding is None and UnicodeEncodeError
    # for characters not representable in the declared charset.
    return req.content.decode(req.apparent_encoding)
def parse_by_soap(content,selectors,split_char=""):
result =[]
soap = BeautifulSoup(content,'html.parser')
for sele in selectors:
all_blocks = soap.select(sele)
for b in all_blocks:
one_item=split_char.join(b.get_text().split())
result.append(one_item)
# for e in ele:
# #print(e)
# result.append(split_char.join(e.get_text().split()))
return result
def output2file(file_name,content):
f = open(file_name,"a",encoding="utf-8")
print(content,file=f)
f.close()
if __name__=="__main__":
url = "https://www.ccf.org.cn/Academic_Evaluation/AI/"
content_req = fetch_data_byrequest(url)
content_reqs=fetch_data_byrequests(url)
# parse file
selector_magzine_a="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(4)"
selector_magzin_b="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(6)"
selector_magzine_c="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(8)"
selector_conf_a="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(12)"
selector_conf_b="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(14)"
selector_conf_c="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(16)"
#selector_test="body > div.main.m-b-md > div.container > div.row-box > div > div.col-md-10 > div > div > ul:nth-child(4) > li"
#ele=soap.select(selector_test)
selectors=[selector_magzine_a+" > li",
selector_magzin_b+" > li",
selector_magzine_c+" > li",
selector_conf_a + " > li",
selector_conf_b + " > li",
selector_conf_c + " > li"]
split_char=';'
result=parse_by_soap(content_req,selectors,split_char)
for item in result:
output2file("result.txt",item)
# 输出到文件
#output2file("request_content.html",content_req)
#output2file("requests_content.html",content_reqs)
抓到的文件: