import re
import json
import openpyxl
import re
import pandas as pd
import requests
import os
import csv
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Silence InsecureRequestWarning through urllib3's disable_warnings helper.
# NOTE(review): no request visible in this file disables certificate
# verification (verify=False), so this suppression may be unnecessary —
# confirm before removing.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# HTTP headers passed along with each page request; only a desktop-Chrome
# User-Agent string is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    ),
}
# Listing pages are numbered from 1; the page number is substituted into the
# `page` query parameter.
PAGE_URL_TEMPLATE = "https://www.yanglaocn.com/policy/?pagetype=&page={page}"

# One match per <div class="PageList_P"> entry. Captured groups, in order:
#   index1 - value of the first href="..." after the opening div
#   index2 - value of the following title="..." attribute
#   index3 - text between target="_blank"> and the closing </a>
#   index4 - text of the <label> that immediately follows that </a>
# re.S lets ".*?" span line breaks inside an entry. The pattern is compiled
# once here rather than once per downloaded page.
ENTRY_PATTERN = re.compile(
    r'<div class="PageList_P">.*?<a href="(?P<index1>.*?)"'
    r'.*?title="(?P<index2>.*?)"'
    r'.*?target="_blank">(?P<index3>.*?)</a>'
    r'<label>(?P<index4>.*?)</label></div>',
    re.S,
)


def build_page_url(page):
    """Return the listing URL for page number *page*."""
    return PAGE_URL_TEMPLATE.format(page=page)


def extract_rows(page_content):
    """Return the entries found in one page of HTML.

    Args:
        page_content: HTML text of a listing page.

    Returns:
        A list with one ``(index1, index2, index3, index4)`` tuple of strings
        per ``ENTRY_PATTERN`` match, in document order; empty when nothing
        matches.
    """
    return [match.groups() for match in ENTRY_PATTERN.finditer(page_content)]


def crawl_policy_pages(first_page=1, last_page=20, out_path="data2.csv",
                       timeout=30):
    """Download a range of listing pages and append their entries to a CSV.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        out_path: CSV file to append to; created if it does not exist.
        timeout: Per-request timeout in seconds passed to ``requests``.

    Raises:
        requests.RequestException: If a page cannot be fetched.
        OSError: If the output file cannot be opened or written.
    """
    # newline="" is what the csv module expects (otherwise extra blank rows
    # can appear on Windows); the explicit encoding keeps non-ASCII text from
    # depending on the platform's default locale encoding.
    with open(out_path, mode="a", newline="", encoding="utf-8") as f:
        csvwriter = csv.writer(f)
        # A single session is reused for every page instead of creating a new
        # one per request.
        with requests.Session() as session:
            for page in range(first_page, last_page + 1):
                res = session.get(
                    build_page_url(page), headers=headers, timeout=timeout
                )
                csvwriter.writerows(extract_rows(res.text))


if __name__ == "__main__":
    # Crawl pages 1 through 20.
    crawl_policy_pages()
# 养老信息网爬取 (crawl of the yanglaocn.com elder-care policy listing)
# https://www.yanglaocn.com/policy/?pagetype=&page=1
# 最终结果 (final result)