Various Ways to Scrape Baidu Hot Search: xpath, re, bs4

Preface

Filtering the same web page content with several different methods; this counts as an early practice exercise.
Conclusion: xpath is simple, re has the best compatibility, and bs4 is the most minimal (though personally I'm not a fan of it). A quick side-by-side sketch of the three APIs follows.
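
To make the API differences concrete before the real scripts, here is a minimal side-by-side sketch; the HTML fragment and the hot-title class name are invented purely for this example.

import re
from lxml import etree
from bs4 import BeautifulSoup

# Invented fragment, only to contrast the three extraction styles.
html = '<div class="hot-title"> Example headline </div>'

# re: works on the raw string, no parsing; portable but fragile.
print(re.search(r'<div class="hot-title">\s*(.*?)\s*</div>', html).group(1))

# xpath: parse once, then query with a path expression.
print(etree.HTML(html).xpath('//div[@class="hot-title"]/text()')[0].strip())

# bs4: parse once, then use CSS selectors and find helpers.
print(BeautifulSoup(html, "lxml").select_one('.hot-title').get_text(strip=True))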

re

import requests
import re


url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
}

response = requests.get(url, headers=header)

data = response.content.decode()

print("Real-time hot searches")
# Sample of the markup the patterns below were written against:
# <a href="https://www.baidu.com/s?tn=SE_PcFYBssrd_na11wmj5&amp;wd=%E6%B9%96%E5%8D%97%E5%90%89%E6%9E%97%E8%B4%B5%E5%B7%9E%E4%BA%91%E5%8D%97%E7%9C%81%E5%A7%94%E4%B9%A6%E8%AE%B0%E8%B0%83%E6%95%B4&amp;rsv_dl=0_left_fyb_doodle" target="_blank" class="OP_LOG_LINK" data-click="{fm:'beha'}"> <div class="c-gap-top-small c-gap-bottom-small">
title_rule = re.compile(r'<span class="c-index.*</span>[\s\S]?\s*(.*)[\s\S]?')
link_rule = re.compile('<a href="(.*?)".*?class="OP_LOG_LINK" data-click="{fm:\'beha\'}">')
searchindex_rule = re.compile(r'<div class="op-hotboard-search-index">\s*(.*)\s*<i class="opr-toplist-st c-icon c-icon-down"></i>')
titles = title_rule.findall(data)
links = link_rule.findall(data)
searchindexs = searchindex_rule.findall(data)

# Print the top five entries; zip() stops at the shortest list, so a
# partial match set no longer raises an IndexError.
for title, link, hot_index in zip(titles[:5], links[:5], searchindexs[:5]):
    print(title)
    print(link)
    print(hot_index)
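
Since the patterns above are sensitive to markup changes, it can help to iterate on them against a saved copy of the page rather than re-requesting it every run. A minimal sketch, assuming a baidu.html (written by the xpath or bs4 scripts below) already sits in the working directory:

import re

# Re-run a pattern against the locally saved page so that regex tweaks
# don't need a fresh request each time.
title_rule = re.compile(r'<span class="c-index.*</span>[\s\S]?\s*(.*)[\s\S]?')

with open("baidu.html", "rb") as f:
    data = f.read().decode()

print(title_rule.findall(data)[:5])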



xpath_pc

import requests
from lxml import etree


url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"

header = {
    # Desktop UA: Baidu serves the PC layout these XPaths target.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
    # "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.93 Mobile Safari/537.36"
}

response = requests.get(url, headers=header)
print(response.status_code)
data = response.content

# Save the raw page so the selectors can be inspected offline.
with open("baidu.html", "wb") as f:
    f.write(data)

x_data = etree.HTML(data)
print(x_data)  # sanity check: an _Element object means parsing succeeded
print("Real-time hot searches")

# Links via a class-based path; titles and descriptions via absolute
# paths copied from the browser, which break if the layout shifts.
result_href = x_data.xpath('//*[@class="op-hotboard-hotnews-body"]//@href')
result_title_raw = x_data.xpath('//*[@id="1"]/div[1]/div/ul/li/a/div/div[1]/text()')
result_description = x_data.xpath('//*[@id="1"]/div[1]/div/ul/li/a/div[2]/text()')
print(len(result_href))
print(len(result_description))

# Each entry yields three text nodes and the title is the second one,
# so take every third node starting at index 1.
result_title_final = []
for i in range(1, len(result_title_raw), 3):
    result_title_final.append(result_title_raw[i].strip())
print(result_title_final)
print(len(result_title_final))

for i in range(len(result_href)):
    print(result_title_final[i])
    print(result_href[i].strip())
    print(result_description[i].strip() + "\n")
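
The absolute //*[@id="1"]/... paths break as soon as Baidu reorders its result blocks. A less brittle sketch against the saved page, assuming the op-hotboard-hotnews-list-title class (the one the bs4 section below selects on) is still present in the markup:

from lxml import etree

# Class-based alternative to the absolute paths above. The class name is
# borrowed from the bs4 section and may change with any Baidu redesign.
with open("baidu.html", "rb") as f:
    x_data = etree.HTML(f.read())

titles_alt = [t.strip()
              for t in x_data.xpath('//*[contains(@class, "op-hotboard-hotnews-list-title")]//text()')
              if t.strip()]
print(titles_alt)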





xpath_mobile

import requests
from lxml import etree


url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"

header = {
    # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
    # Mobile UA: Baidu returns a completely different layout, so the
    # PC XPaths above no longer apply.
    "User-Agent": "Mozilla/5.0 (Linux; Android 10; SM-G975U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.93 Mobile Safari/537.36"
}

response = requests.get(url, headers=header)

data = response.content
with open("baidu.html", "wb") as f:
    f.write(data)

x_data = etree.HTML(data)
print(x_data)
print("Real-time hot searches")

news_timely = x_data.xpath('//*[@id="results"]/div[1]/div/article/section/div[2]/div/a/div/div[1]/div/span[2]/text()')
print(len(news_timely))
print(news_timely)
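
Because the returned markup depends entirely on the User-Agent, it is worth confirming which layout actually came back before trusting any XPath. A rough check that continues from the script above; the id="results" and op-hotboard markers are assumptions taken from the two snapshots scraped here:

# Rough layout check on the parsed page: the mobile snapshot carries an
# id="results" container, the PC snapshot the op-hotboard-* classes.
if x_data.xpath('//*[@id="results"]'):
    print("mobile layout served")
elif x_data.xpath('//*[contains(@class, "op-hotboard")]'):
    print("PC layout served")
else:
    print("unknown layout; the selectors probably need updating")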




bs4

import requests
from bs4 import BeautifulSoup

url = "https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B"

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36"
}

response = requests.get(url, headers=header)

data = response.content
with open("baidu.html", "wb") as f:
    f.write(data)

soup = BeautifulSoup(data, "lxml")

# Query once up front instead of re-parsing inside every loop iteration.
titles = soup.select('.op-hotboard-hotnews-list-title')
links = soup.find_all(attrs={"class": "OP_LOG_LINK", "data-click": "{fm:'beha'}"})

# The board lists three blocks of five entries each:
# real-time (0-4), today (5-9), and seven-day (10-14).
print("Real-time hot searches")
for i in range(0, 5):
    print(titles[i].get_text(strip=True) + " link " + links[i].get('href'))

print("Today's hot searches")
for i in range(5, 10):
    print(titles[i].get_text(strip=True) + " link " + links[i].get('href'))

print("Seven-day hot searches")
for i in range(10, 15):
    print(titles[i].get_text(strip=True) + " link " + links[i].get('href'))
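
The three loops differ only in their label and index range, so they collapse naturally into a single helper. A small refactor sketch of the printing logic, reusing the titles and links lists queried above:

def print_board(label, start, stop):
    # Shared printing logic for one five-entry block of the board.
    print(label)
    for title, link in zip(titles[start:stop], links[start:stop]):
        print(title.get_text(strip=True) + " link " + link.get('href'))

print_board("Real-time hot searches", 0, 5)
print_board("Today's hot searches", 5, 10)
print_board("Seven-day hot searches", 10, 15)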


