# 1.定位到来电分类分区
# 2. 提取子页面的连接地址 child_href1
# 3. 在子页面提取想要的数据
# 4. 再定位到详细来电,进入二重子页面
# 5. 提取二重子页面连接地址 child_href2
# 6. 在二重子页面(来电情况)里提取想要的数据
# 代码如下:
# 1. 定位到来电分类分区
# 2. 提取子页面的连接地址 child_href1
# 3. 在子页面提取想要的数据
# 4. 再定位到详细来电,进入二重子页面
# 5. 提取二重子页面连接地址 child_href2
# 6. 在二重子页面(来电情况)里提取想要的数据
# 7. 存放在 exp_data-2020083156.json 文件中


import json
import time
from urllib.parse import urljoin

import requests
from lxml import etree
14
# Scrape the Chengdu 12345 hotline site in three levels:
#   index page -> one page per call category -> one detail page per call.
# The collected columns are appended as a single JSON line to OUTPUT_PATH.

# 1. Entry page listing the call categories.
url = "http://12345.chengdu.gov.cn/moreTelByClass?TelType=1101"
domain = "http://12345.chengdu.gov.cn/"
domain1 = "http://12345.chengdu.gov.cn/"  # same value as `domain`; both names kept

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"
}

OUTPUT_PATH = "exp_data-2020083156.json"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Containers whose <a href> points at a category page (level 1).
CATEGORY_XPATH = '//*[@id="container1"]/div/div[1]/div[1]/div[3]/div'
# Rows of a category page whose <a href> points at a call detail page (level 2).
DETAIL_ROW_XPATH = '//*[@id="container1"]/div/div[2]/div[1]/ul/li[@class="f12px"]'
# Cell N of the li[2] row of a category page (the only row the summary reads).
SUMMARY_XPATH = '//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[{}]/text()'
# (output key, cell index) pairs, in the order the keys appear in the output.
SUMMARY_COLUMNS = (
    ("tel_title", 1),
    ("handle_depart", 2),
    ("status", 3),
    ("type_", 4),
    ("page_views", 5),
    ("tel_date", 6),
)

TEL_CONT_XPATH = '//*[@id="FmContent"]/text()'
# NOTE(review): browsers insert <tbody> that the served HTML may not contain;
# confirm this path matches the raw markup, otherwise it yields no result.
PROC_DEPARTS_XPATH = '//*[@id="container1"]/div[2]/table/tbody/tr[6]/td[2]/text()'
HANDLE_RESULT_XPATH = '//*[@id="DOverDesc"]/text()'

# Every key of the output dict, in output order.
RECORD_KEYS = tuple(key for key, _ in SUMMARY_COLUMNS) + (
    "tel_cont",
    "proc_departs",
    "handle_result",
)


def first_or_default(results, default=""):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return results[0] if results else default


def absolute_url(href, base=domain):
    """Resolve *href* against *base*.

    Unlike plain string concatenation this does not produce a double slash
    for root-relative hrefs and leaves already-absolute hrefs untouched.
    """
    return urljoin(base, href)


def fetch_html(link):
    """GET *link* and return the parsed lxml tree of the UTF-8 decoded body.

    Raises requests.RequestException on connection errors, timeouts and
    HTTP error statuses. The response is always closed.
    """
    with requests.get(link, headers=header, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        resp.encoding = "utf-8"
        return etree.HTML(resp.text)


def collect_links(page, container_xpath, base):
    """Return the absolute URL of the first ./a/@href under each matched node.

    Nodes without a link are skipped instead of raising IndexError.
    """
    links = []
    for node in page.xpath(container_xpath):
        hrefs = node.xpath("./a/@href")
        if hrefs:
            links.append(absolute_url(hrefs[0], base))
    return links


def scrape():
    """Walk index -> category -> detail pages and return the collected columns.

    The result maps each key in RECORD_KEYS to a list. Summary keys get one
    entry per category page; detail keys get one entry per detail page.
    Pages that fail to download are reported on stdout and skipped.
    """
    record = {key: [] for key in RECORD_KEYS}

    index_page = fetch_html(url)
    if index_page is None:
        return record

    # 2. Collect the category page links, then 3. read each category page.
    for category_link in collect_links(index_page, CATEGORY_XPATH, domain):
        try:
            category_page = fetch_html(category_link)
        except requests.RequestException as err:
            print(str(err))
            continue
        if category_page is None:
            continue

        for key, column in SUMMARY_COLUMNS:
            cells = category_page.xpath(SUMMARY_XPATH.format(column))
            record[key].append(first_or_default(cells))

        # 4./5. Collect the detail links of this page only, so every detail
        # page is downloaded exactly once.
        for detail_link in collect_links(category_page, DETAIL_ROW_XPATH, domain1):
            # 6. Read the call content, handling department and result.
            try:
                detail_page = fetch_html(detail_link)
            except requests.RequestException as err:
                print(str(err))
                continue
            if detail_page is None:
                continue

            record["tel_cont"].append(
                first_or_default(detail_page.xpath(TEL_CONT_XPATH))
            )
            record["proc_departs"].append(
                first_or_default(detail_page.xpath(PROC_DEPARTS_XPATH))
            )
            # Kept as the full list of text nodes, as before.
            record["handle_result"].append(detail_page.xpath(HANDLE_RESULT_XPATH))

        time.sleep(1)  # pause between category pages to go easy on the server

    return record


def save_record(record, path=OUTPUT_PATH):
    """Append *record* to *path* as one JSON line (non-ASCII kept readable).

    An IOError is printed rather than raised. The `with` block closes the
    file, so no extra close() is needed (the old one failed with NameError
    whenever open() itself failed).
    """
    try:
        with open(path, "a", encoding="utf-8") as out:
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
    except IOError as err:
        print(str(err))


def main():
    """Run the scrape, store the result and report completion."""
    save_record(scrape())
    print("over!")


if __name__ == "__main__":
    main()