import os
import re
import time
import requests
import openpyxl
from urllib.parse import urlencode
from bs4 import BeautifulSoup
class BaiDuSearch(object):
    """Crawl Baidu search result pages and save (title, url) rows to an .xlsx file.

    Flow: ``main()`` warms up the session on the Baidu home page, builds the
    search URL, then follows the "next page" link up to ``self.count`` times,
    writing every parsed result page to the workbook.
    """

    def __init__(self):
        self.session = requests.session()   # reused so cookies persist across requests
        self.excel = None                   # openpyxl workbook; set by initialize_excel()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.count = 3  # crawl depth: how many result pages to fetch (default 3)
        self.time = 2   # seconds to sleep between page fetches

    # Issue a GET request and report its status.
    def get_status(self, url):
        """GET *url* with the session; return the Response on HTTP 200, else None."""
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        print("网络连接失败!")
        return None

    # Visit the home page once to establish the session.
    def index(self, url):
        """Return True when the home page responded with HTTP 200."""
        return self.get_status(url) is not None

    # Fetch a search results page.
    def search(self, url):
        """Fetch *url* and return its HTML text; '' when the request failed.

        BUG FIX: the original accessed ``response.text`` unconditionally and
        raised AttributeError whenever get_status() returned None.
        """
        response = self.get_status(url)
        return response.text if response is not None else ""

    # Parse one results page.
    @staticmethod
    def parse(html):
        """Extract ``{title: href}`` for every organic result on the page."""
        title_url = {}
        soup = BeautifulSoup(html, "html5lib")
        content_lefts = soup.select("#content_left > .result.c-container")
        for result in content_lefts:
            a = result.select("h3 > a")
            if not a:
                # Result container without a title link (e.g. special cards) — skip.
                continue
            title_url[a[0].text] = a[0].get("href")
        return title_url

    # Prepare the output workbook.
    def initialize_excel(self, path):
        """Load the workbook at *path* if it exists, else create it with a header row.

        BUG FIX: the header row is written only for a brand-new workbook;
        re-running against an existing file no longer appends a duplicate header.
        """
        if os.path.exists(path):
            self.excel = openpyxl.load_workbook(path)
        else:
            self.excel = openpyxl.Workbook()
            self.write_to_excel(path, {"title": "url"})

    # Append rows and persist to disk.
    def write_to_excel(self, path, title_url):
        """Append each (title, url) pair of *title_url* to the first sheet and save."""
        # First sheet of the workbook (same as the original's sheetnames[0] lookup).
        sheet = self.excel[self.excel.sheetnames[0]]
        for title in title_url:
            print(title, title_url[title])
            sheet.append((title, title_url[title]))
        self.excel.save(path)

    # Locate the "next page" link.
    def page_parse(self, url, html):
        """Return the absolute URL of the '下一页>' (next page) link, or None.

        BUG FIX: explicitly returns None when the link is absent (last page)
        instead of falling off the end; the original also incremented
        ``self.count`` here, which had no effect on the already-evaluated
        ``range()`` in get_page() — that dead mutation is removed.
        """
        soup = BeautifulSoup(html, "html5lib")
        for page in soup.select("#page > a"):
            if page.text == "下一页>":
                href = page.get("href")
                # href is site-relative; rebuild an absolute URL from the
                # scheme+host prefix of the current url.
                base = re.findall(r"(https://.*?)/", url)
                return base[0] + href
        return None

    # Generator over successive result-page URLs.
    def get_page(self, url, html):
        """Yield up to ``self.count`` next-page URLs, fetching each page in turn."""
        for _ in range(self.count):
            url = self.page_parse(url, html)
            if url is None:
                # BUG FIX: stop cleanly at the last result page; the original
                # went on to print/fetch None and crashed.
                return
            print("获取下一页:", url)
            time.sleep(self.time)  # be polite between page fetches
            # Fetch the page so the next iteration can find its "next" link.
            # NOTE(review): next_page() fetches the same URL again for parsing —
            # kept to preserve the original structure, but it is a double fetch.
            html = self.search(url)
            yield url

    # Walk the pagination and persist every page.
    def next_page(self, url, html, path):
        """Follow the pagination from (*url*, *html*), writing each page to Excel."""
        for page_url in self.get_page(url, html):
            page_html = self.search(page_url)
            self.write_to_excel(path, self.parse(page_html))

    def main(self):
        """Entry point: warm up the session, then crawl and save the results."""
        # Home page first, to establish session cookies.
        self.index("https://www.baidu.com")
        # Output workbook lives next to the current working directory.
        path = os.path.abspath(os.path.join(os.getcwd(), "百度搜索信息.xlsx"))
        self.initialize_excel(path)
        # Build the search URL for the query term.
        url = "https://www.baidu.com/s?" + urlencode({'wd': '代理'})
        html = self.search(url)
        self.next_page(url, html, path)
        self.excel.close()  # close the workbook handle
if __name__ == "__main__":
    # Script entry point: build the crawler and run the full search pipeline.
    BaiDuSearch().main()
# python 百度搜索页抽取 (Baidu search-page extraction)
# Blog-extraction artifact, preserved as a comment: "最新推荐文章于 2024-03-30 16:44:27 发布"