import requests
from lxml import etree
import time
'''
Approach:
1. Pick the novel to scrape and its index-page URL.
2. Scrape the chapter links and build every chapter's detail-page URL by string concatenation.
3. Scrape the book title.
4. Scrape each chapter's title and its body text.
5. Append the chapters in order and save them all into a single txt file.
'''
# Request headers: identify as a normal browser
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# Index page of the novel to scrape
url = 'http://www.xbiquzw.com/45_45290/'
# Fetch a page and parse it into an lxml element tree
def get_html(url):
    code_html = requests.get(url, headers=headers)  # send the request
    code_html.encoding = 'UTF-8'                    # the site serves UTF-8
    code_html_get = code_html.text                  # raw HTML as text
    soup = etree.HTML(code_html_get)                # parse into an element tree
    return soup
# Collect the links to every chapter page
def get_list(url):
    soup = get_html(url)
    list_box = soup.xpath('//*[@id="list"]/dl/dd/a/@href')
    book_lists = []
    for i in list_box:
        book_lists.append(url + i)  # the hrefs are relative, so prepend the index URL
    return book_lists
# Scrape the book title
def get_book_title(url):
    soup = get_html(url)
    book_title = soup.xpath('//*[@id="info"]/h1/text()')
    return book_title[0]  # xpath() returns a list; str() on it would keep the brackets and quotes
# Scrape the chapter titles from the index page
def get_title(url):
    soup = get_html(url)
    get_name = soup.xpath('//*[@id="list"]/dl/dd/a/text()')
    return get_name
# Scrape the body text of a chapter page
def get_novel_content(url):
    soup = get_html(url)
    # only the text nodes of the content div are wanted
    content = soup.xpath('//*[@id="content"]/text()')
    return content
# Save the whole book to a local txt file
def save_novel(url):
    book_lists = get_list(url)
    chapter_titles = get_title(url)  # titles come from the index page, in the same order as the links
    book_title = get_book_title(url)
    num = 1
    with open(book_title + '.txt', 'a', encoding='utf-8') as f:
        for chapter_title, url_list in zip(chapter_titles, book_lists):
            f.write(chapter_title + '\n')
            for c in get_novel_content(url_list):
                f.write(c)
            f.write('\n')
            print(f'{num}')   # progress indicator
            num += 1
            time.sleep(0.5)   # be polite between chapter requests
if __name__ == '__main__':
    save_novel(url)
The code above scrapes a complete novel, chapter by chapter, into one txt file.
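A run over a few hundred chapters will sooner or later hit a flaky response, so it is worth hardening get_html. One option is a requests.Session with automatic retries; the sketch below is only that, a sketch, with illustrative (not tuned) Retry parameters, and it reuses the headers dict from the script above.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from lxml import etree

# One shared session: keeps the connection alive and retries
# transient server errors with exponential backoff
session = requests.Session()
retries = Retry(total=3, backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))

def get_html(url):
    resp = session.get(url, headers=headers, timeout=10)  # timeout guards against hangs
    resp.encoding = 'UTF-8'
    return etree.HTML(resp.text)

Since the return type is unchanged, this version can replace the original get_html without touching the rest of the script.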
import requests
from lxml import etree
import os
def spider_picture():
    # Create a folder for the downloaded images
    if not os.path.exists("./day07图片解析-素材"):
        os.mkdir("./day07图片解析-素材")
    # Spoof a browser User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36 Core/1.77.106.400 QQBrowser/10.9.4626.400'}
    page_index = 1
    # The listing has 63 pages, so loop 63 times
    while page_index <= 63:
        # The first page has no index suffix, so handle it separately
        if page_index == 1:
            index = ''
        else:
            index = 'index_' + str(page_index) + '.html'
        page_index += 1
        # Build the URL of the next listing page
        url = 'http://pic.netbian.com/shoujibizhi/' + index
        # Send the request; headers must go in as a keyword argument --
        # passed positionally it would be treated as query parameters
        page_text = requests.get(url, headers=headers).text
        # Parse the listing page into an element tree
        tree = etree.HTML(page_text)
        # The xpath is absolute, so query the tree directly; the original
        # draft's extra for-loop over the root element was unnecessary
        src = tree.xpath('//div[@class="alist"]/ul/li/a/img/@src')
        for urls in src:
            url_src = 'http://pic.netbian.com/' + urls
            # Download the raw image bytes
            page_src = requests.get(url=url_src, headers=headers).content
            img_name = urls.split('/')[-1]
            img_path = './day07图片解析-素材/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(page_src)
            print(img_name, "saved!")
    print("All pages done!")
The code above downloads phone-wallpaper images from every listing page.
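One behaviour worth improving: if the run is interrupted, restarting it re-downloads all 63 pages from scratch. A minimal resume check, assuming the same img_path naming as above (the helper name download_image is mine, not from the original script), skips files that already exist:

import os
import requests

def download_image(url_src, img_path, headers):
    # Skip images already on disk so an interrupted run can resume
    if os.path.exists(img_path):
        print(img_path, "already exists, skipping")
        return
    page_src = requests.get(url=url_src, headers=headers).content
    with open(img_path, 'wb') as fp:
        fp.write(page_src)

Inside the for-loop above, the download-and-save lines would then collapse into a single download_image(url_src, img_path, headers) call.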
import pymysql
from con123 import con              # a ready-made pymysql connection object
from xpath_spider import *          # brings in save_novel() and url
from picture import spider_picture
def insert_data():
    # Insert a new account through the shared connection
    user = input('Enter a username: ')
    pwd = input('Enter a password: ')
    cursor = con.cursor()
    sql4 = 'insert into account values (%s,%s)'
    args = (user, pwd)  # parameterized values keep the query safe from SQL injection
    cursor.execute(sql4, args)
    con.commit()
    cursor.close()
    con.close()
def delete_data():
    # Open a fresh connection
    con = pymysql.connect(host="localhost", port=3308, user="root", password="root", db="test06", charset='utf8')
    cursor = con.cursor()
    # Delete a row by username
    user = input('Enter the username to delete: ')
    sql = 'delete from account where user=%s'
    args = (user,)  # note the trailing comma: (user) is just a string, (user,) is a one-element tuple
    cursor.execute(sql, args)
    con.commit()
    cursor.close()
    con.close()
def auth_user(user, pwd):
    con = pymysql.connect(host="localhost", port=3308, user="root", password="root", db="test06", charset='utf8')
    cursor = con.cursor()
    sql = 'select * from account where user=%s and pwd=%s'
    args = (user, pwd)
    cursor.execute(sql, args)
    result = cursor.fetchone()  # None if no matching row
    cursor.close()
    con.close()
    if result:
        print('Login OK')
        return True
    else:
        return False
if __name__ == '__main__':
    a = input('Enter username: ')
    b = input('Enter password: ')
    if auth_user(a, b):
        spider_picture()
    elif a == 'spider' and b == 'root':
        save_novel(url)  # hard-coded fallback account runs the novel spider instead
    else:
        print('Login failed')
The code above ties the two spiders to a database-backed login. When I find the time I will publish a version that connects to the database in a different way.
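Until then, here is one direction such a rewrite could take: wrapping the connection in a context manager, so each function stops repeating the connect/commit/close boilerplate. This is only a sketch under the same connection parameters as above; the helper name get_connection is mine.

import pymysql
from contextlib import contextmanager

@contextmanager
def get_connection():
    # Same connection parameters as the functions above
    con = pymysql.connect(host="localhost", port=3308, user="root",
                          password="root", db="test06", charset='utf8')
    try:
        yield con
        con.commit()   # commit only if the body raised no exception
    finally:
        con.close()

def auth_user(user, pwd):
    with get_connection() as con:
        cursor = con.cursor()
        cursor.execute('select * from account where user=%s and pwd=%s', (user, pwd))
        return cursor.fetchone() is not None

Every query then runs on a connection that is guaranteed to be closed, even when it fails halfway.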