[Software Requirements Engineering] A Small Bendibao Crawler

Spring-_-Bear's CSDN Blog Navigation

1. All City Links

Source code:

import requests
from lxml import etree
import os

# Send a GET request and fetch the page
response = requests.get('http://www.bendibao.com/index.htm', headers={
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36'
})

# Decode the response body as UTF-8 and parse it into an HTML tree
html = etree.HTML(response.content.decode('utf8'))

# If the output file already exists, delete it and recreate it
file_path = "all_cites_links.txt"
if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf-8')

total_result_count = 0
# Each <dl> under the city list holds one province and its cities
divs = html.xpath("//div[@class='city-list']/dl")
for div in divs:
    province = div.xpath("./dt/text()")
    cities = div.xpath("./dd/a/text()")
    city_urls = div.xpath("./dd/a/@href")
    # Walk the city-name list and the city-link list in parallel
    for i in range(len(cities)):
        total_result_count += 1
        city_info = str(total_result_count) + "、省份:{:<10s}城市:{:<10s}链接:{}".format(province[0], cities[i], city_urls[i])
        # Print the record and append it to the file
        print(city_info)
        file.write(city_info + "\n")

file.close()

Sample output:

(screenshot of the program output)
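
If you prefer to avoid the manual file handling and index bookkeeping, the same extraction can be written with a with block and zip. This is only a sketch under the same assumptions about the page structure as the script above; urljoin is included purely in case the site returns relative hrefs, and the tab-separated output format is a choice made here, not the original one.

from urllib.parse import urljoin

import requests
from lxml import etree

response = requests.get('http://www.bendibao.com/index.htm', headers={
    'User-Agent': 'Mozilla/5.0'  # any reasonable UA; the one from the script above also works
})
response.raise_for_status()  # fail fast on HTTP errors
html = etree.HTML(response.content.decode('utf8'))

# 'w' mode truncates the file, so the delete-then-append dance is unnecessary
with open('all_cites_links.txt', 'w', encoding='utf-8') as file:
    count = 0
    for dl in html.xpath("//div[@class='city-list']/dl"):
        province = dl.xpath("./dt/text()")[0]
        # zip keeps each city name aligned with its own href
        for city, href in zip(dl.xpath("./dd/a/text()"), dl.xpath("./dd/a/@href")):
            count += 1
            record = f"{count}\t{province}\t{city}\t{urljoin(response.url, href)}"
            print(record)
            file.write(record + "\n")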

2. Provincial Capital Search Links

Source code:

from selenium import webdriver
from selenium.webdriver.edge.service import Service
import time
import requests
from lxml import etree
import os

# Launch a headless Edge browser and load the home page
options = webdriver.EdgeOptions()
options.add_argument("headless")
service = Service('D:\\Software\\WebDriver\\msedgedriver.exe')
driver = webdriver.Edge(service=service, options=options)
request_url = "http://www.bendibao.com/index.htm"
driver.get(request_url)
driver.maximize_window()
current_tab = driver.current_window_handle

# Fetch the same page with requests to parse the province/city list
response = requests.get(request_url, headers={
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
})
# Decode the response body as UTF-8 and parse it into an HTML tree
html = etree.HTML(response.content.decode('utf8'))

# If the output file already exists, delete it and recreate it
file_path = "province_search_link.txt"
if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf-8')

total_result = 0
for province_index in range(1, 26):
    # Read the current province name and its capital city from the parsed page
    province = html.xpath(f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]/dt/text()")[0]
    main_city = html.xpath(f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]/dd/a[1]/text()")[0]

    # Click the capital city link, which opens the city site in a new tab
    driver.find_element("xpath", f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]/dd/a[1]").click()
    # Switch to the newly opened tab
    all_tabs = driver.window_handles
    for tab in all_tabs:
        if tab != current_tab:
            driver.switch_to.window(tab)
    next_tab = driver.current_window_handle
    time.sleep(1)

    # Type the search keyword into the search box
    driver.find_element("xpath", "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新落户条件及人才补贴政策")
    # Click the search button, which opens the result page in another tab
    driver.find_element("xpath", "//*[@id='header']/div[3]/form/button").click()
    # Switch to the newly opened result tab
    all_tabs = driver.window_handles
    for tab in all_tabs:
        if tab != current_tab and tab != next_tab:
            driver.switch_to.window(tab)
    time.sleep(1)

    # Print the record and save it to the file
    total_result += 1
    city_name_search_str = str(total_result) + "、" + province + ":" + main_city + ": " + driver.current_url
    print(city_name_search_str)
    file.write(city_name_search_str + "\n")
    file.flush()

    # Close the two new tabs, return to the Bendibao home page, and continue with the next capital
    driver.close()
    driver.switch_to.window(next_tab)
    driver.close()
    driver.switch_to.window(current_tab)
    time.sleep(1)

file.close()
driver.quit()

Sample output:

(screenshot of the program output)
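
The fixed time.sleep(1) pauses are the slowest and most fragile part of this loop. Selenium's explicit waits can watch for the new window instead. The sketch below assumes the driver and current_tab set up in the script above; switch_to_new_tab is a helper name introduced here, not a Selenium API.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, timeout=10)

def switch_to_new_tab(known_tabs):
    """Wait until a window outside known_tabs appears, then switch to it."""
    wait.until(EC.number_of_windows_to_be(len(known_tabs) + 1))
    new_tab = [t for t in driver.window_handles if t not in known_tabs][0]
    driver.switch_to.window(new_tab)
    return new_tab

# Usage inside the loop, replacing the manual handle bookkeeping and sleeps:
# next_tab = switch_to_new_tab({current_tab})      # after clicking the capital city
# switch_to_new_tab({current_tab, next_tab})       # after clicking the search button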

3. Provincial Capital Policy Lists

Source code:

from selenium import webdriver
from selenium.webdriver.edge.service import Service
import time
import requests
from lxml import etree
import os
import re

# Launch a headless Edge browser and load the home page
options = webdriver.EdgeOptions()
options.add_argument("headless")
service = Service('D:\\Software\\WebDriver\\msedgedriver.exe')
driver = webdriver.Edge(service=service, options=options)
request_url = "http://www.bendibao.com/index.htm"
driver.get(request_url)
driver.maximize_window()
current_tab = driver.current_window_handle

# If the output file already exists, delete it and recreate it
file_path = 'policy_content_link.txt'
if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf-8')

total_result = 0
for province_index in range(1, 26):
    # Click the capital city link, which opens the city site in a new tab
    driver.find_element('xpath', f"//*[@id='city-list']/div/div/div[3]/dl[{province_index}]/dd/a[1]").click()
    # Switch to the newly opened tab
    all_tabs = driver.window_handles
    for tab in all_tabs:
        if tab != current_tab:
            driver.switch_to.window(tab)
    tab_2 = driver.current_window_handle
    time.sleep(1)

    # Type the search keyword into the search box
    driver.find_element('xpath', "//*[@id='header']/div[3]/form/div/input[2]").send_keys("最新人才落户补贴政策")
    # Click the search button, which opens the result page in another tab
    driver.find_element('xpath', "//*[@id='header']/div[3]/form/button").click()
    # Switch to the newly opened result tab
    all_tabs = driver.window_handles
    for tab in all_tabs:
        if tab != current_tab and tab != tab_2:
            driver.switch_to.window(tab)
    time.sleep(1)

    # Fetch the search-result page with requests and parse it
    response = requests.get(driver.current_url, headers={
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
    })
    html = etree.HTML(response.content.decode('utf8'))
    # Each result is an <a class='result'> element holding a title and a link
    divs = html.xpath("//div[@class='result-list']/a[@class='result']")

    for div in divs:
        # The title of one result is split across several text nodes; join them
        titles = div.xpath("./div[@class='result-title']//text()")
        website_title = "".join(titles)

        # Keep only result titles that contain all three keywords
        if website_title.find("人才") == -1:
            continue
        if website_title.find("落户") == -1:
            continue
        if website_title.find("政策") == -1:
            continue

        # Assemble the record
        total_result += 1
        websites = div.xpath("./@href")
        website_title = str(total_result) + "、" + website_title + ":" + websites[0]
        # Strip newlines, tabs and spaces with a regular expression
        website_title = re.sub(r"\s+", "", website_title)

        # Print the record and save it to the file
        print(website_title)
        file.write(website_title + "\n")
        file.flush()

    # Close the two new tabs, return to the Bendibao home page, and continue with the next city
    driver.close()
    driver.switch_to.window(tab_2)
    driver.close()
    driver.switch_to.window(current_tab)
    time.sleep(1)

file.close()
driver.quit()

Sample output:

(screenshot of the program output)
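
The three consecutive substring checks on the title can be collapsed into a single test, which also makes the keyword list easy to change. A minimal sketch, assuming website_title holds the joined result title as in the loop above; is_policy_title and REQUIRED_KEYWORDS are names introduced here for illustration.

# Keywords a result title must contain to be kept (the same three as above)
REQUIRED_KEYWORDS = ("人才", "落户", "政策")

def is_policy_title(title: str) -> bool:
    """Return True only if the title mentions every required keyword."""
    return all(keyword in title for keyword in REQUIRED_KEYWORDS)

# Usage inside the result loop:
# if not is_policy_title(website_title):
#     continue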

4. Wuhan Policy Content

Source code:

import requests
from lxml import etree
import os
import re

# Send the request and fetch the Wuhan policy article page
response = requests.get('http://wh.bendibao.com/live/202078/113158.shtm', headers={
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36"
})
html = etree.HTML(response.content.decode('utf8'))

# If the output file already exists, delete it and recreate it
file_path = 'wuhan_policy_content.txt'
if os.path.exists(file_path):
    os.remove(file_path)
file = open(file_path, 'a', encoding='utf-8')

# Extract the article title, publish time and introduction
title = html.xpath("//article[@id='news-article']/h1/text()")[0]
publish_time = html.xpath("//article[@id='news-article']//span[@class='public_time']/text()")[0]
introduction = html.xpath("//article[@id='news-article']//p[@class='dao']/text()")[0]

# Assemble the header information, then print and save it
info_str = "标题:" + title + "\n" + "时间:" + publish_time + "\n" + "导语:" + introduction + "\n\n"
print(info_str)
file.write(info_str)

# Collect the text nodes of the article body
details_list = html.xpath("//article[@id='news-article']//div[@class='content-box']//text()")
for i in range(len(details_list)):
    # Strip tabs, spaces and newlines from each text node
    details_list[i] = re.sub(r"\s+", "", details_list[i])
    # Skip nodes that are empty (or script residue) after cleaning
    if details_list[i] == "showtopcontent();" or details_list[i] == "":
        continue

    # Print the paragraph and save it to the file
    print(details_list[i] + "\n")
    file.write(details_list[i] + "\n\n")

# Close the file
file.close()

Sample output:

(screenshot of the extracted policy text)
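
The script above hard-codes a single article URL. If the policy pages of other cities collected in part 3 share the same #news-article markup (an assumption that would need checking city by city), the extraction can be wrapped in a reusable function, sketched below; fetch_policy_article is a name introduced here for illustration.

import re
import requests
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0"}  # any reasonable UA, e.g. the one used above

def fetch_policy_article(url: str) -> dict:
    """Return the title, publish time, introduction and body paragraphs of one article."""
    response = requests.get(url, headers=HEADERS)
    html = etree.HTML(response.content.decode('utf8'))
    body = []
    for text in html.xpath("//article[@id='news-article']//div[@class='content-box']//text()"):
        text = re.sub(r"\s+", "", text)
        if text and text != "showtopcontent();":
            body.append(text)
    return {
        "title": html.xpath("//article[@id='news-article']/h1/text()")[0],
        "time": html.xpath("//article[@id='news-article']//span[@class='public_time']/text()")[0],
        "introduction": html.xpath("//article[@id='news-article']//p[@class='dao']/text()")[0],
        "body": body,
    }

# Usage:
# article = fetch_policy_article('http://wh.bendibao.com/live/202078/113158.shtm')
# print(article["title"])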

5. Data Cleaning & Word Cloud

Source code:

import re
import collections
import jieba
import wordcloud
import matplotlib.pyplot as plt

# Open the file read-only and load its content
file = open('wuhan_policy_content.txt', 'r', encoding='utf-8')
content = file.read()
file.close()

# Text cleaning: compile a pattern of characters to strip from the text
pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"')
content = re.sub(pattern, '', content)

# Tokenise the text with jieba in accurate (non-full) mode
seg_list_exact = jieba.cut(content, cut_all=False)
# Custom list of words to drop (stop words, punctuation, digits)
remove_words = [u'的', u',', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。', u' ', u'、', u'中', u'在', u'了',
                u'通常', u'如果', u'我们', u'需要', u'0', u'1', u'2', '3', '4', '5', '6', '7', '8', '9', '《', '》', '12']
# Keep every token that is not in the drop list
object_list = []
for word in seg_list_exact:
    if word not in remove_words:
        object_list.append(word)

# Count word frequencies
word_counts = collections.Counter(object_list)
# Print the ten most frequent words
word_counts_top10 = word_counts.most_common(10)
print(word_counts_top10)

wc = wordcloud.WordCloud(
    font_path='simfang.ttf',
    max_words=55,
    max_font_size=150,
    background_color='white',
    width=800, height=600,
)

# Build the word cloud from the frequency counter
wc.generate_from_frequencies(word_counts)
# Show the word cloud
plt.imshow(wc)
# Hide the axes
plt.axis('off')
# Display the figure
plt.show()
wc.to_file('word_cloud.png')

Sample output:

(screenshot of the generated word cloud)
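
Besides the word cloud, the same Counter can feed a quick bar chart of the ten most frequent words. A sketch only, assuming word_counts from the script above and a Chinese-capable font such as SimHei being available to matplotlib; the output name top10_words.png is chosen here for illustration.

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']   # let matplotlib render Chinese labels
plt.rcParams['axes.unicode_minus'] = False

words, counts = zip(*word_counts.most_common(10))
plt.bar(words, counts)
plt.title('Top 10 words in wuhan_policy_content.txt')
plt.tight_layout()
plt.savefig('top10_words.png')
plt.show()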
