Python: extracting Baidu search result pages

The script below opens a Baidu session, searches for a keyword, follows the "next page" links for a configurable number of result pages, and writes each result's title and URL to an Excel workbook via openpyxl.

百度搜索.py

import os
import re
import time
import requests
import openpyxl
from urllib.parse import urlencode
from bs4 import BeautifulSoup


class BaiDuSearch(object):
    def __init__(self):
        self.session = requests.session()
        self.excel = None
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.count = 3  # how many "next page" hops to follow; default 3
        self.time = 2  # seconds to wait before fetching the next page

    # Send a GET request and return the response if it succeeded
    def get_status(self, url):
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        else:
            print("Network connection failed!")
            return None

    # Visit the home page first to establish the session
    def index(self, url):
        response = self.get_status(url)
        if not response:
            return False
        return True

    # Fetch a page and return its HTML
    def search(self, url):
        response = self.get_status(url)
        if not response:
            return ""
        return response.text

    # Parse a results page into a {title: url} mapping
    @staticmethod
    def parse(html):
        title_url = {}
        soup = BeautifulSoup(html, "html5lib")
        # Each organic result sits under #content_left as a .result.c-container block
        content_lefts = soup.select("#content_left > .result.c-container")
        for result in content_lefts:
            a = result.select("h3 > a")
            if not a:
                continue  # skip result blocks without a title link
            title = a[0].text
            url = a[0].get("href")
            title_url[title] = url
        return title_url

    # Initialize the Excel workbook: load it if it exists, otherwise create it
    def initialize_excel(self, path):
        if os.path.exists(path):
            self.excel = openpyxl.load_workbook(path)
        else:
            self.excel = openpyxl.Workbook()

            # Write a header row ("title", "url") into the new workbook
            headline_data = {
                "title": "url",
            }
            self.write_to_excel(path, headline_data)

    # Append the {title: url} pairs to the first sheet and save the workbook
    def write_to_excel(self, path, title_url):
        # Get the names of all sheets
        sheet_names = self.excel.sheetnames
        # Take the first sheet (equivalent to self.excel["Sheet1"] for a fresh workbook)
        sheet = self.excel[sheet_names[0]]

        for title in title_url:
            print(title, title_url[title])
            sheet.append((title, title_url[title]))

        self.excel.save(path)

    # Find the "下一页>" (next page) link and build its absolute URL
    def page_parse(self, url, html):
        soup = BeautifulSoup(html, "html5lib")
        pages = soup.select("#page > a")
        for page in pages:
            title = page.text
            if title == "下一页>":
                href = page.get("href")
                # href is relative, so prepend the scheme and host of the current URL
                get_url = re.findall(r"(https://.*?)/", url)
                return get_url[0] + href
        # No next-page link found (last results page)
        return None

    # Generator: follow "next page" links up to self.count times
    def get_page(self, url, html):
        for _ in range(self.count):
            url = self.page_parse(url, html)
            if not url:
                break  # no further pages to fetch
            print("Fetching next page:", url)
            time.sleep(self.time)  # pause between requests
            # Fetch the next results page and hand it to the caller
            html = self.search(url)
            yield html

    # Write the current page's results, then keep paging and writing
    def next_page(self, url, html, path):
        # The first results page was already fetched by the caller
        self.write_to_excel(path, self.parse(html))

        for html in self.get_page(url, html):
            # Parse the HTML into {title: url} pairs
            title_url = self.parse(html)

            # Append them to the Excel file
            self.write_to_excel(path, title_url)

    def main(self):
        # Home page
        url = "https://www.baidu.com"
        self.index(url)

        # Initialize the Excel workbook
        path = os.path.abspath(os.path.join(os.getcwd(), "百度搜索信息.xlsx"))
        self.initialize_excel(path)

        url = "https://www.baidu.com/s?"
        data = {
            'wd': '代理',  # search keyword ("proxy")
        }

        # Build the search URL
        url = url + urlencode(data)

        # Fetch the first results page
        html = self.search(url)

        # Page through the remaining results
        self.next_page(url, html, path)

        self.excel.close()  # close the Excel workbook


if __name__ == '__main__':
    bd = BaiDuSearch()
    bd.main()
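
The page depth and the pause between requests can be tuned through the count and time attributes before calling main(). As a quick sanity check, the workbook the script produces can be read back with openpyxl. A minimal sketch, assuming the script above is saved as 百度搜索.py so the class can be imported (the import name and the read-back loop are illustrative additions, not part of the original script):

import openpyxl

from 百度搜索 import BaiDuSearch  # assumes the script above was saved as 百度搜索.py

bd = BaiDuSearch()
bd.count = 5  # follow 5 result pages instead of the default 3
bd.time = 3   # wait 3 seconds between page fetches
bd.main()

# Read the saved rows back to verify what was written
wb = openpyxl.load_workbook("百度搜索信息.xlsx")
sheet = wb[wb.sheetnames[0]]
for title, url in sheet.iter_rows(min_row=2, values_only=True):  # min_row=2 skips the header
    print(title, url)
wb.close()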
