一、所有城市链接
程序源码:
import requests
from lxml import etree
import os

# Fetch the bendibao homepage with a mobile User-Agent so the mobile layout
# (which carries the full province/city list) is served.
response = requests.get('http://www.bendibao.com/index.htm', headers={
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36'
})
# Decode the body as UTF-8 and parse it into an HTML element tree.
html = etree.HTML(response.content.decode('utf8'))

file_path = "all_cites_links.txt"
total_result_count = 0
# 'w' mode truncates any previous run's file, replacing the original
# remove-then-append dance, and the context manager guarantees the handle is
# closed even if a request/parse step raises.
with open(file_path, 'w', encoding='utf-8') as file:
    # Each <dl> under the city list holds one province (<dt>) and its cities (<dd>).
    for province_dl in html.xpath("//div[@class='city-list']/dl"):
        province = province_dl.xpath("./dt/text()")
        cities = province_dl.xpath("./dd/a/text()")
        city_urls = province_dl.xpath("./dd/a/@href")
        # Walk city names and their links in lockstep.
        for city, url in zip(cities, city_urls):
            total_result_count += 1
            city_info = str(total_result_count) + "、省份:{:<10s}城市:{:<10s}链接:{}".format(province[0], city, url)
            # Print the record and persist it to the output file.
            print(city_info)
            file.write(city_info + "\n")
效果展示:
二、省会搜索链接
程序源码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import requests
from lxml import etree
import os

# Launch a headless Edge browser and open the bendibao homepage.
options = webdriver.EdgeOptions()
options.add_argument("headless")
service = Service('D:\\Software\\WebDriver\\msedgedriver.exe')
driver = webdriver.Edge(service=service, options=options)
request_url = "http://www.bendibao.com/index.htm"
driver.get(request_url)
driver.maximize_window()
current_tab = driver.current_window_handle

# Fetch the same page with requests (mobile UA) so the static province/city
# names can be read with lxml instead of driving the browser for them.
response = requests.get(request_url, headers={
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
})
html = etree.HTML(response.content.decode('utf8'))

file_path = "province_search_link.txt"
total_result = 0
# 'w' mode truncates any previous run's output; the context manager closes the
# file even if selenium raises midway (the original left it open on errors).
with open(file_path, 'w', encoding='utf-8') as file:
    for province_index in range(1, 26):
        # Province name and its capital city (the first <a> of this <dl>).
        dl_xpath = f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]"
        province = html.xpath(dl_xpath + "/dt/text()")[0]
        main_city = html.xpath(dl_xpath + "/dd/a[1]/text()")[0]
        # Click the capital city link; the city site opens in a new tab.
        driver.find_element("xpath", dl_xpath + "/dd/a[1]").click()
        # Switch to the newly opened city tab.
        for tab in driver.window_handles:
            if tab != current_tab:
                driver.switch_to.window(tab)
        next_tab = driver.current_window_handle
        time.sleep(1)
        # Type the search keywords and submit; results open in a third tab.
        driver.find_element("xpath", "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新落户条件及人才补贴政策")
        driver.find_element("xpath", "//*[@id='header']/div[3]/form/button").click()
        # Switch to the search-result tab (neither the home nor the city tab).
        for tab in driver.window_handles:
            if tab != current_tab and tab != next_tab:
                driver.switch_to.window(tab)
        time.sleep(1)
        # Record the search-result URL for this provincial capital.
        total_result += 1
        city_name_search_str = str(total_result) + "、" + province + ":" + main_city + ": " + driver.current_url
        print(city_name_search_str)
        file.write(city_name_search_str + "\n")
        # Flush per province so a crash mid-run still leaves partial results.
        file.flush()
        # Close both extra tabs and return to the homepage for the next province.
        driver.close()
        driver.switch_to.window(next_tab)
        driver.close()
        driver.switch_to.window(current_tab)
        time.sleep(1)
效果展示:
三、省会政策列表
程序源码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import requests
from lxml import etree
import os
import re

# Launch a headless Edge browser and open the bendibao homepage.
options = webdriver.EdgeOptions()
options.add_argument("headless")
service = Service('D:\\Software\\WebDriver\\msedgedriver.exe')
driver = webdriver.Edge(service=service, options=options)
request_url = "http://www.bendibao.com/index.htm"
driver.get(request_url)
driver.maximize_window()
current_tab = driver.current_window_handle

file_path = 'policy_content_link.txt'
total_result = 0
# 'w' mode truncates any previous run's output; the context manager closes the
# file even if selenium raises midway (the original left it open on errors).
with open(file_path, 'w', encoding='utf-8') as file:
    for province_index in range(1, 26):
        # Click the provincial capital (first <a> of this <dl>); the city site
        # opens in a new tab.
        dl_xpath = f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]"
        driver.find_element('xpath', dl_xpath + "/dd/a[1]").click()
        # Switch to the newly opened city tab.
        for tab in driver.window_handles:
            if tab != current_tab:
                driver.switch_to.window(tab)
        tab_2 = driver.current_window_handle
        time.sleep(1)
        # Enter the search keywords and submit; results open in a third tab.
        driver.find_element('xpath', "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新人才落户补贴政策")
        driver.find_element('xpath', "//*[@id='header']/div[3]/form/button").click()
        # Switch to the search-result tab (neither the home nor the city tab).
        for tab in driver.window_handles:
            if tab != current_tab and tab != tab_2:
                driver.switch_to.window(tab)
        time.sleep(1)
        # Re-fetch the result page with requests so lxml can parse the HTML.
        response = requests.get(driver.current_url, headers={
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
        })
        html = etree.HTML(response.content.decode('utf8'))
        # Each result anchor carries a title div and an href.
        for result in html.xpath("//div[@class='result-list']/a[@class='result']"):
            # The title may be split across several text nodes; join them.
            website_title = "".join(result.xpath("./div[@class='result-title']//text()"))
            # Keep only results whose title contains all three keywords.
            if not all(kw in website_title for kw in ("人才", "落户", "政策")):
                continue
            total_result += 1
            websites = result.xpath("./@href")
            website_title = str(total_result) + "、" + website_title + ":" + websites[0]
            # Drop every whitespace run (newlines, tabs, spaces) from the record.
            website_title = re.sub(r"\s+", "", website_title)
            print(website_title)
            file.write(website_title + "\n")
            # Flush per record so a crash mid-run still leaves partial results.
            file.flush()
        # Close both extra tabs and return to the homepage for the next province.
        driver.close()
        driver.switch_to.window(tab_2)
        driver.close()
        driver.switch_to.window(current_tab)
        time.sleep(1)
效果展示:
四、武汉政策内容
程序源码:
import requests
from lxml import etree
import os
import re

# Fetch the Wuhan household-registration policy article with a mobile UA.
response = requests.get('http://wh.bendibao.com/live/202078/113158.shtm', headers={
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
})
html = etree.HTML(response.content.decode('utf8'))

file_path = 'wuhan_policy_content.txt'
# NOTE(review): encoding is intentionally left at the platform default because
# the companion word-cloud script reads this file the same way — if you add
# encoding='utf-8' here, update the reader to match.
# 'w' mode truncates any old file (replacing the remove-then-append dance) and
# the context manager guarantees the handle is closed.
with open(file_path, 'w') as file:
    # Article title, publish time and lead-in paragraph.
    title = html.xpath("//article[@id='news-article']/h1/text()")[0]
    publish_time = html.xpath("//article[@id='news-article']//span[@class='public_time']/text()")[0]
    introduction = html.xpath("//article[@id='news-article']//p[@class='dao']/text()")[0]
    # Assemble the header record, print it and persist it.
    info_str = "标题:" + title + "\n" + "时间:" + publish_time + "\n" + "导语:" + introduction + "\n\n"
    print(info_str)
    file.write(info_str)
    # Walk every text node of the article body.
    for detail in html.xpath("//article[@id='news-article']//div[@class='content-box']//text()"):
        # Collapse all whitespace (tabs, spaces, newlines) out of the fragment.
        detail = re.sub(r"\s+", "", detail)
        # Skip fragments that are empty or inline-script leftovers.
        if detail in ("showtopcontent();", ""):
            continue
        print(detail + "\n")
        file.write(detail + "\n\n")
效果展示:
五、数据清洗&词云
程序源码:
import re
import collections
import jieba
import wordcloud
import matplotlib.pyplot as plt

# Read the previously scraped policy text. Encoding is deliberately left at
# the platform default because the scraper script wrote the file the same way;
# keep the two in sync if either is changed.
with open('wuhan_policy_content.txt', 'r') as file:
    content = file.read()

# Strip punctuation and control characters before segmentation.
pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
content = re.sub(pattern, '', content)
# Segment the text with jieba in precise (non-exhaustive) mode.
seg_list_exact = jieba.cut(content, cut_all=False)
# Stop-word list: particles, punctuation and digits that carry no meaning.
remove_words = ['的', ',', '和', '是', '随着', '对于', '对', '等', '能', '都', '。', ' ', '、', '中', '在', '了',
                '通常', '如果', '我们', '需要', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '《', '》', '12']
# Keep every segmented word that is not a stop word.
object_list = [word for word in seg_list_exact if word not in remove_words]
# Count word frequencies and show the ten most common words.
word_counts = collections.Counter(object_list)
print(word_counts.most_common(10))
# Build the word cloud from the frequency table.
wc = wordcloud.WordCloud(
    font_path='simfang.ttf',  # CJK-capable font; the default cannot render Chinese
    max_words=55,
    max_font_size=150,
    background_color='white',
    width=800, height=600,
)
wc.generate_from_frequencies(word_counts)
# Display the cloud without axes, then save it to disk.
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('word_cloud.png')
效果展示: