Contents
Crawler basics + re/etree/BeautifulSoup + saving locally / writing to a database

This splits roughly into: crawler basics, then parsing with re / etree / BeautifulSoup, then saving locally or writing to a database.
Basics
We'll scrape something very simple: the Baidu news hot-search board, grabbing each entry's rank, title, and hot-search index.
Let's start analyzing. This page is easy: the data sits right in the page's own HTML source, so no special analysis is needed; we just fetch the source. Here is the simple source-fetching code:
import requests

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA so we look like a normal browser

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    print(content)
    return content

if __name__ == "__main__":
    content = content()
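The UA header is enough for this page, but if you want the script to fail loudly on a bad response, requests provides raise_for_status(). A tiny optional hardening sketch (my addition, not part of the original flow):

response = requests.get(url=url, headers=headers)
response.raise_for_status()  # raises an HTTPError on 4xx/5xx responses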
That dumps the scraped page data. From here, parsing splits into three approaches.
re
Paste the page source into regex101 (regex101: build, test, and debug regex) and start analyzing.
Here the pattern is easy to spot: every title sits inside a <div class="c-single-text-ellipsis">...</div>, so we can capture it with
<div class="c-single-text-ellipsis"> (.*?) </div>
If regex101 reports an error on the bare slash, escape it:
<div class="c-single-text-ellipsis"> (.*?) <\/div>
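Note that the escaped form is only a regex101 quirk: Python's re module does not treat the forward slash specially. A quick sanity check (the sample string below is made up just for illustration):

import re

sample = '<div class="c-single-text-ellipsis"> some title </div>'
print(re.findall(r'<div class="c-single-text-ellipsis"> (.*?) </div>', sample))
# prints: ['some title']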
That matches our titles. Next we need the hot-search index, and the same idea applies:
<div class="hot-index_1Bl1a"> (.*?) </div>
With both patterns worked out, we can go ahead and write the code:
import requests
import re

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

def re_text(content):
    fire = re.findall("""<div class="hot-index_1Bl1a"> (.*?) </div>""", content)
    title = re.findall("""<div class="c-single-text-ellipsis"> (.*?) </div>""", content)
    print(fire, title)
    print(len(fire), len(title))

if __name__ == "__main__":
    content = content()
    re_text(content)
That prints the extracted data; the two printed lengths let us check that index values and titles line up.
lxml/etree
Here we use the XPath Helper browser extension.
Shift+Ctrl+X opens the tool; holding Shift while hovering over an element extracts its XPath.
Then we trim the leading path segments to see whether a shorter, greedier query still works.
It turns out this pulls out all the titles:
//div[@class='c-single-text-ellipsis']
Extracting the hot-search index works the same way:
//div[@class='hot-index_1Bl1a']
Now we write the code:
import requests
from lxml import etree

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

def etree_text(content):  # lxml etree
    new_content = etree.HTML(content)
    fire = new_content.xpath("//div[@class='hot-index_1Bl1a']/text()")
    title = new_content.xpath("//div[@class='c-single-text-ellipsis']/text()")
    print(fire, title)

if __name__ == "__main__":
    content = content()
    etree_text(content)
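One caveat with this approach: text() nodes keep whatever whitespace surrounds the text in the HTML, so the extracted strings may need trimming. A minimal cleanup sketch, assuming the two lists built in etree_text() above:

fire = [f.strip() for f in fire]    # drop the surrounding whitespace from each index value
title = [t.strip() for t in title]  # same for the titles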
beautifulsoup
Personally I find this tool very comfortable to use.
Open the site, inspect the element holding the data you want, then right-click it and choose Copy > CSS path.
Paste that path into BeautifulSoup's select() to find the nodes, then call .get_text() on each result to pull out the content.
Titles and index values are both handled this way.
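A side note before the code: the browser's copied CSS path is long and brittle, since it breaks if any ancestor element changes. Because the two class names look unique to these nodes, a much shorter selector should match the same elements; this is my assumption, not verified against the live page:

title = soup.select('div.c-single-text-ellipsis')  # same nodes, far shorter selector
fire = soup.select('div.hot-index_1Bl1a')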
import requests
from bs4 import BeautifulSoup

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

def beau_text(content):
    soup = BeautifulSoup(content, 'html.parser')  # use Python's built-in html.parser
    # print(soup.prettify())  # pretty-print the document
    # print(soup.body)
    title = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    title2 = []
    for i in title:
        title2.append(i.get_text())  # collect the hot-search titles
    fire = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    fire2 = []
    for i in fire:
        fire2.append(i.get_text())  # collect the hot-search index values
    return fire2, title2  # return both lists; the later sections rely on this

if __name__ == "__main__":
    content = content()
    fire, title = beau_text(content)
    print(fire, title)
That wraps up the three parsing approaches.
Next: saving locally and writing to the database.
Saving locally
import requests
from bs4 import BeautifulSoup

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

# beau_text() is the BeautifulSoup parser from the previous section;
# it must end with `return fire2, title2` for the unpacking below to work

def to_txt(fire, title):
    content = []
    for i in range(0, len(title)):
        content.append(str(i) + fire[i] + title[i])
    with open('百度新闻.txt', 'w', encoding='utf-8') as fp:
        for i in content:
            fp.write(i + '\n')

if __name__ == "__main__":
    content = content()
    fire, title = beau_text(content)
    to_txt(fire, title)
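As written, to_txt() glues rank, index, and title together with no separator. If you prefer a more readable file, a small variation (my tweak, not the original format) swaps the append inside the loop for a tab-separated line:

content.append(f"{i + 1}\t{fire[i]}\t{title[i]}")  # rank, index, title separated by tabs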
Writing to the database
First we need to set things up on the database side.
Here my database is named spider and the table is 百度新闻; its fields are id, title, and fire, matching the INSERT statement used below.
db = pymysql.connect(host="localhost", port=3306, user="root", passwd="111111", db="spider", charset="utf8")
connects to the database, and
cursor = db.cursor()
sets up the cursor.
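For reference, a minimal sketch of the table definition; the column types and sizes are my guesses based on the fields used in the INSERT statement, not the author's original schema:

CREATE TABLE 百度新闻 (
    id INT PRIMARY KEY,     -- rank, used as the primary key
    title VARCHAR(255),     -- hot-search title
    fire VARCHAR(32)        -- hot-search index value
) DEFAULT CHARSET=utf8;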
Full code
import requests
import re  # needed for the duplicate-key check in to_data()
from bs4 import BeautifulSoup
import pymysql

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

db = pymysql.connect(host="localhost", port=3306, user="root", passwd="214253551", db="spider", charset="utf8")
cursor = db.cursor()  # connect to the database and get a cursor

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

def beau_text(content):
    soup = BeautifulSoup(content, 'html.parser')  # use Python's built-in html.parser
    # print(soup.prettify())  # pretty-print the document
    # print(soup.body)
    title = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    title2 = []
    for i in title:
        title2.append(i.get_text())  # collect the hot-search titles
    fire = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    fire2 = []
    for i in fire:
        fire2.append(i.get_text())  # collect the hot-search index values
    return fire2, title2

def to_data(fire, title):
    data2 = title
    data3 = fire
    # sqli = 'delete from 百度新闻'
    # cursor.execute(sqli)
    # db.commit()
    data1 = []
    for i in range(0, len(title)):
        data1.append(i + 1)
        sql = "INSERT INTO 百度新闻 (id,title,fire) VALUES ( '" + str(data1[i]) + "', '" + data2[i] + "', '" + data3[i] + "');"
        # print(sql)
        try:
            db.ping(reconnect=True)
            cursor.execute(sql)
            db.commit()
            # print('ok')
        except Exception as err:
            # check whether the error is just a duplicate primary key
            result1 = re.search('Duplicate entry.*key.*PRIMARY', str(err))
            # if so, nothing to do; otherwise (unknown cause) roll back
            if result1 is None:
                db.rollback()
    # close the database connection
    db.close()

if __name__ == "__main__":
    content = content()
    fire, title = beau_text(content)  # beautifulsoup
    to_data(fire, title)
That's the complete database version of the code.
One more listing follows: the all-in-one version with every approach included. This is still just a simple crawler.
import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
import pymysql

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
}  # spoof the UA

db = pymysql.connect(host="localhost", port=3306, user="root", passwd="214253551", db="spider", charset="utf8")
cursor = db.cursor()

def content():
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    return content

# def re_text(content):  # re regex
#     fire = re.findall("""<div class="hot-index_1Bl1a"> (.*?) </div>""", content)
#     title = re.findall("""<div class="c-single-text-ellipsis"> (.*?) </div>""", content)
#     print(fire, title)
#     print(len(fire), len(title))

# def etree_text(content):  # lxml etree
#     new_content = etree.HTML(content)
#     fire = new_content.xpath("//div[@class='hot-index_1Bl1a']/text()")
#     title = new_content.xpath("//div[@class='c-single-text-ellipsis']/text()")
#     print(fire, title)

def beau_text(content):
    soup = BeautifulSoup(content, 'html.parser')  # use Python's built-in html.parser
    # print(soup.prettify())  # pretty-print the document
    # print(soup.body)
    title = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    title2 = []
    for i in title:
        title2.append(i.get_text())  # collect the hot-search titles
    fire = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    fire2 = []
    for i in fire:
        fire2.append(i.get_text())  # collect the hot-search index values
    return fire2, title2

# def to_txt(fire, title):  # save locally
#     content = []
#     for i in range(0, len(title)):
#         content.append(str(i) + fire[i] + title[i])
#     with open('百度新闻.txt', 'w', encoding='utf-8') as fp:
#         for i in content:
#             fp.write(i + '\n')

def to_data(fire, title):
    data2 = title
    data3 = fire
    # sqli = 'delete from 百度新闻'
    # cursor.execute(sqli)
    # db.commit()
    data1 = []
    for i in range(0, len(title)):
        data1.append(i + 1)
        sql = "INSERT INTO 百度新闻 (id,title,fire) VALUES ( '" + str(data1[i]) + "', '" + data2[i] + "', '" + data3[i] + "');"
        # print(sql)
        try:
            db.ping(reconnect=True)
            cursor.execute(sql)
            db.commit()
            # print('ok')
        except Exception as err:
            # check whether the error is just a duplicate primary key
            result1 = re.search('Duplicate entry.*key.*PRIMARY', str(err))
            # if so, nothing to do; otherwise (unknown cause) roll back
            if result1 is None:
                db.rollback()
    # close the database connection
    db.close()

if __name__ == "__main__":
    content = content()
    # re_text(content)  # regex
    # etree_text(content)  # lxml etree
    fire, title = beau_text(content)  # beautifulsoup
    # to_txt(fire, title)  # save locally
    to_data(fire, title)
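One closing note: building the INSERT by string concatenation is fragile (a quote character in a title would break the SQL) and unsafe in general. A safer sketch using pymysql's parameterized execute, as a drop-in for the sql lines above:

sql = "INSERT INTO 百度新闻 (id, title, fire) VALUES (%s, %s, %s)"
cursor.execute(sql, (data1[i], data2[i], data3[i]))  # pymysql escapes the values itself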