The following example crawls the first 25 chapters of a novel and saves the content to both a txt file and a MySQL database.
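The INSERT statement further down assumes that a database named xiaoshuo and a table named xs already exist. The original post does not show the table definition, so the following one-time setup is only a sketch: the column names match the INSERT, but the column types are assumptions.

import pymysql

# One-time setup (run once): create the database and table the scraper writes to.
# Column types are assumptions; adjust as needed.
conn = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8')
cursor = conn.cursor()
cursor.execute('create database if not exists xiaoshuo character set utf8')
cursor.execute('use xiaoshuo')
cursor.execute('''create table if not exists xs(
    xs_name varchar(100),     -- novel title
    xs_content varchar(200),  -- chapter title
    xs_text mediumtext        -- chapter body
)''')
cursor.close()
conn.close()

With the database ready, the full scraper: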
import requests
import bs4
import pymysql
# Fetch the novel's index page and parse it
url = 'http://www.xbiquge.la/13/13959/'
res1 = requests.get(url=url)
res1.encoding = 'utf-8'
# print(res1.text)  # the raw HTML of the index page
bs = bs4.BeautifulSoup(res1.text, 'html.parser')
xs_name = bs.h1.string  # the novel's title, taken from the page's <h1> tag
# Open the database connection
conn = pymysql.connect(host='localhost',
                       db='xiaoshuo',
                       user='root',
                       password='123456',
                       charset='utf8')
cursor = conn.cursor()
# Parameterized INSERT; the %s placeholders are filled in by cursor.execute()
sql = 'insert into xs(xs_name,xs_content,xs_text) values(%s,%s,%s)'
n = 0  # chapter counter
for dd in bs.find_all('dd'):  # find_all('dd') returns all the <dd> tags; on each iteration, dd is one <dd> tag
    n += 1
    link = 'http://www.xbiquge.la' + dd.a['href']  # the <a> tag inside this <dd> holds the chapter link
    xs_content = dd.a.string  # the chapter title (the string content of the current tag)
    # print(dd.a.text)  # .text would return all nested text
    res2 = requests.get(url=link)
    res2.encoding = 'utf-8'
    # Save the chapter's raw HTML to a txt file (overwritten on every iteration), then read it back
    with open('aaa.txt', 'w', encoding='utf-8') as f:
        f.write(res2.text)
    with open('aaa.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    bs2 = bs4.BeautifulSoup(text, 'html.parser')
    # Extract the actual chapter body from the div with id="content"
    try:
        xs_text = bs2.find_all('div', attrs={'id': 'content'})[0].text
    except IndexError:
        xs_text = None
    print(xs_text)
    cursor.execute(sql, args=[xs_name, xs_content, xs_text])
    if n == 25:  # stop after 25 chapters
        break
# Commit all 25 inserts at once, then release the cursor and connection
conn.commit()
cursor.close()
conn.close()
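One caveat: each pass through the loop reopens aaa.txt in 'w' mode, so the file is overwritten and ends up holding only the raw HTML of the last chapter fetched. If the goal stated at the top, saving the novel's text to a txt document, is meant literally, a small variant like the following (run inside the loop after xs_text is extracted; the per-novel filename is an assumption) appends each chapter instead:

# Append the extracted chapter to one txt file per novel (filename assumed)
if xs_text is not None:
    with open(xs_name + '.txt', 'a', encoding='utf-8') as f:
        f.write(xs_content + '\n')  # chapter title
        f.write(xs_text + '\n\n')   # chapter body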
The output is as follows: