文本数据挖掘作业实验1 -----爬取数据

Mercury_cc

于 2022-10-28 22:57:00 发布

阅读量91

点赞数

本文链接：https://blog.csdn.net/Mercury_cc/article/details/130121496

版权

# 1. 定位到来电分类分区

# 2. 提取子页面的链接地址 child_href1

# 3. 在子页面提取想要的数据

# 4. 再定位到详细来电，进入二重子页面

# 5. 提取二重子页面的链接地址 child_href2

# 6. 在二重子页面（来电情况）里提取想要的数据

# 7. 将提取到的数据以 JSON 格式保存到文件中

代码如下：

  1 # 1.定位到来电分类分区
  2 # 2. 提取子页面的连接地址 child_href1
  3 # 3. 在子页面提取想要的数据
  4 # 4. 再定位到详细来电，进入二重子页面
  5 # 5. 提取二重子页面连接地址 child_href2
  6 # 6. 在二重子页面(来电情况)里提取想要的数据
  7 # 7. 存放在data.json文件中
  8 
  9 
 10 import requests
 11 from lxml import etree
 12 import time
 13 import json
 14 
 15 # 1.定位到来电分类分区
 16 url = "http://12345.chengdu.gov.cn/moreTelByClass?TelType=1101"
 17 domain = "http://12345.chengdu.gov.cn/"
 18 domain1 = "http://12345.chengdu.gov.cn/"
 19 
 20 header = {
 21     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"
 22 }
 23 
 24 # get请求
 25 resp = requests.get(url, headers=header)
 26 resp.encoding = "utf-8"
 27 # print(resp.text)
 28 
 29 html = etree.HTML(resp.text)
 30 
 31 child_href1_list = [] # 存放child_href1的列表
 32 child_href2_list = [] # 存放child_href2的列表
 33 # 创建空字典存放数据
 34 dicts = {}
 35 # 创建空列表 子页面1
 36 tel_titles = []
 37 handle_departs = []
 38 statuss = []
 39 types = []
 40 page_viewss = []
 41 tel_dates = []
 42 # 创建空列表 子页面2
 43 tel_conts = []
 44 proc_departss = []
 45 handle_results = []
 46 
 47 
 48 # child_href = html.xpath('//*[@id="container1"]/div/div[1]/div[1]/div[3]/div[1]/a/@href')[0]
 49 # child_href1 = domain + child_href
 50 # print(child_href1)
 51 
 52 
 53 # 2.提取子页面的连接地址 child_href1
 54 child_href1s = html.xpath('//*[@id="container1"]/div/div[1]/div[1]/div[3]/div')
 55 for href in child_href1s:
 56     child_href1 = domain + href.xpath("./a/@href")[0]
 57     # print(child_href1)
 58     child_href1_list.append(child_href1)
 59 
 60 # 3. 在子页面提取想要的数据
 61 for href1 in child_href1_list:
 62     child_resp1 = requests.get(href1, headers=header)
 63     child_resp1.encoding = 'utf-8'
 64     # print(child_resp1.text)
 65     html1 = etree.HTML(child_resp1.text) # 解析html
 66 
 67     # 分别将提取的数据存入相应列表
 68     tel_title = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[1]/text()')[0]
 69     tel_titles.append(tel_title)
 70 
 71     handle_depart = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[2]/text()')[0]
 72     handle_departs.append(handle_depart)
 73 
 74     status = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[3]/text()')[0]
 75     statuss.append(status)
 76 
 77     type_ = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[4]/text()')[0]
 78     types.append(type_)
 79 
 80     page_views = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[5]/text()')[0]
 81     page_viewss.append(page_views)
 82 
 83     tel_date = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[2]/a/div[6]/text()')[0]
 84     tel_dates.append(tel_date)
 85 
 86     # 向字典中增加键值对
 87     dicts['tel_title'] = tel_titles
 88     dicts['handle_depart'] = handle_departs
 89     dicts['status'] = statuss
 90     dicts['type_'] = types
 91     dicts['page_views'] = page_viewss
 92     dicts['tel_date'] = tel_dates
 93 
 94     # 4. 再定位到详细来电，进入二重子页面
 95     # 5. 提取二重子页面连接地址 child_href2
 96     child_href2s = html1.xpath('//*[@id="container1"]/div/div[2]/div[1]/ul/li[@class="f12px"]') # 筛选
 97     # print(child_href2s)
 98     for href_a in child_href2s:
 99         child_href2 = domain1 + href_a.xpath("./a/@href")[0]
100         # print(child_href2)
101         child_href2_list.append(child_href2)
102 
103     # 6. 在二重子页面(来电情况)里提取想要的数据
104     for href2 in child_href2_list:
105         child_resp2 = requests.get(href2, headers=header)
106         child_resp2.encoding = 'utf-8'
107         # print(child_resp2.text)
108         html2 = etree.HTML(child_resp2.text) # 解析html
109 
110         # 分别将提取的数据存入相应列表
111         tel_cont = html2.xpath('//*[@id="FmContent"]/text()')[0]
112         tel_conts.append(tel_cont)
113 
114         proc_departs = html2.xpath('//*[@id="container1"]/div[2]/table/tbody/tr[6]/td[2]/text()')[0]
115         proc_departss.append(proc_departs)
116 
117         handle_result = html2.xpath('//*[@id="DOverDesc"]/text()')
118         #print(handle_result)
119         handle_results.append(handle_result)
120 
121         # 向字典中增加键值对
122         dicts['tel_cont'] = tel_conts
123         dicts['proc_departs'] = proc_departss
124         dicts['handle_result'] = handle_results
125 
126 
127 # 保存数据为json格式
128 try:
129     with open('exp_data-2020083156.json', 'a', encoding="utf-8") as f:
130         f.write(json.dumps(dicts, ensure_ascii=False) + "\n")  # ensure_ascii=False，则返回值可以包含非ascii值
131 except IOError as e:
132     print(str(e))
133 
134 finally:
135     f.close()
136 
137 time.sleep(1)
138 print("over!")
139 resp.close()
140 child_resp1.close()

Mercury_cc

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
文本数据挖掘作业实验1 -----爬取数据

# 1.定位到来电分类分区# 2. 提取子页面的连接地址 child_href1# 3. 在子页面提取想要的数据# 4. 再定位到详细来电，进入二重子页面# 5. 提取二重子页面连接地址 child_href2# 6. 在二重子页面(来电情况)里提取想要的数据代码如下： 1 # 1.定位到来电分类分区 2 # 2. 提取子页面的连接地址 child_...
复制链接

扫一扫