"""
不规则数据爬取
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import FakeUserAgent
Referer = 'https://www.ccdi.gov.cn/scdcn/'
url = 'https://www.ccdi.gov.cn/scdcn/zggb/djcf/'

# 1. Headers with a randomized User-Agent plus the referring page,
#    so the request looks like a normal browser navigation.
hd = {'User-Agent': FakeUserAgent().random, 'Referer': Referer}

# 2. Fetch the listing page. A timeout prevents the script from hanging
#    forever, and raise_for_status() fails fast instead of parsing an
#    HTTP error page as if it were the listing.
r = requests.get(url, headers=hd, timeout=10)
r.raise_for_status()

# 3. The site serves UTF-8; set it explicitly so r.text decodes correctly.
r.encoding = 'UTF-8'

# 4. Parse the HTML and collect every news entry from the <li> elements.
soup = BeautifulSoup(r.text, 'html.parser')

list11 = []  # stripped headline text of each entry
list22 = []  # stripped publication date of each entry
for item in soup.find_all('li'):
    # Skip list items that are not news entries (no link or no date) —
    # the original crashed with AttributeError on such items.
    if item.a is None or item.span is None:
        continue
    title = item.a.string
    date = item.span.string
    if title is None or date is None:
        continue
    list11.append(title.strip())
    list22.append(date.strip())
def _parse_record(title, timestamp):
    """Split one CCDI disciplinary headline into structured fields.

    Headlines roughly follow the pattern
        <region><position><surname+given name>严重违纪违法…<penalty>
    so the fields are carved out with str.find and slicing. This is a
    heuristic parser: any field that cannot be located comes back as ''.

    :param title: stripped headline text (one element of list11)
    :param timestamp: matching publication date (one element of list22)
    :return: dict with keys 地区 / 姓名 / 职位 / 处分 / 时间
    """
    region_markers = ['市', '省', '区', '司']  # characters that end the region part
    surnames = ["孙", "胡", "李", "王", "曹", "宋", "张", "盛", "沈", "郝",
                "刘", "黄", "陈", "谢", "于"]  # surnames observed in this column

    # Region: everything up to and including the FIRST region marker present.
    # (The original looped over every marker, clobbering `address` with ''
    # whenever a later marker was absent, and its position loop indexed a
    # list that usually had fewer entries — an IndexError. First-hit wins
    # is the fix.)
    region_end = -1
    for marker in region_markers:
        pos = title.find(marker)
        if pos > -1:
            region_end = pos
            break
    address = title[:region_end + 1]  # '' when no marker found (region_end == -1)

    # Name: from the first known surname up to '严', the start of the
    # boilerplate phrase "严重违纪违法" that follows every name.
    severity_pos = title.find('严')
    name_start = -1
    for surname in surnames:
        pos = title.find(surname)
        if pos > -1:
            name_start = pos
            break
    name = title[name_start:severity_pos] if name_start > -1 else ''

    # Position: the text between the region marker and the surname.
    zhiwei = title[region_end + 1:name_start] if name_start > -1 else ''

    # Penalty: everything after the phrase ending in '违法'.
    chufen = title.split('违法')[-1]

    return {'地区': address, '姓名': name, '职位': zhiwei,
            '处分': chufen, '时间': timestamp}


# Pair each headline with its publication date and collect parsed records.
info_all = [_parse_record(title, ts) for title, ts in zip(list11, list22)]
print(info_all)
# Scraping unstructured data with a crawler: applying str.find and str.split.
# First published 2022-12-06 21:08:37.