import requests
import re
from urllib import parse
# Step 1: fetch the content of a web page.
def getHtml(url, timeout=10):
    """Download a web page and return its body as decoded text.

    The request is sent with a desktop-browser User-Agent header. The
    response encoding is replaced with the one detected from the body
    (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status); in that case
        "fail" is printed.
    """
    req_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=req_headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are reported
        # as failures instead of being returned as page content.
        r.raise_for_status()
    except requests.RequestException:
        # RequestException is the base class of every error requests raises
        # (ConnectionError, Timeout, HTTPError, ...). The previous handler
        # caught EOFError, which these calls do not raise for such failures.
        print("fail")
        return None
    # Decode using the encoding guessed from the body rather than the one
    # declared in the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
# Step 2: extract keywords from the page using regular expressions.
# Example: crawl the aliases of a province.
def craw_baidubaike(province_name):
ori_url = "https://baike.baidu.com/item/xxx"
with open("craw_results.txt", "w", encoding="utf-8") as file:
# -- windows系统需要进行编码转换
# province_parse = parse.urlencode({"wd": province_name})[3:] # url = ori_url.replace("xxx", province_parse)
url = ori_url.replace("xxx", province_name)
html = getHtml(url)
pattern_page_format = re.compile('名