import os
import urllib.request as request
from urllib.request import Request,urlopen,urlretrieve
'''
解压缩或压缩
'''
import gzip
'''
ssl协议验签
'''
import ssl
'''
解析数据
'''
from lxml import etree
'''
连接数据库
'''
import pymysql
import time
def getURLData(path, charset):
    """Download a URL and return its body decoded as text.

    :param path: URL to fetch.
    :param charset: Name of the encoding used to decode the response body.
    :return: The decoded body, or ``""`` when the response status is not 200.
    :raises UnicodeDecodeError: If the body is not valid in ``charset``.

    Any error raised by ``urlopen`` itself propagates to the caller.
    """
    # NOTE(review): this replaces urllib's default HTTPS context factory for
    # the whole process, i.e. certificate verification is disabled for every
    # later default-context request, not just this one. Kept as-is because
    # other code may depend on it -- confirm whether that is intended.
    ssl._create_default_https_context = ssl._create_unverified_context
    headers = {
        # Only advertise the encoding this function can actually decode;
        # offering "deflate" or "br" invites responses we cannot decompress.
        "Accept-Encoding": "gzip",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    }
    req = Request(url=path, headers=headers)
    # The context manager closes the connection even if reading fails.
    with urlopen(req) as conn:
        if conn.code != 200:
            return ""
        data = conn.read()
        # Content-coding tokens are case-insensitive, so normalize first.
        encoding = (conn.headers.get("Content-Encoding") or "").strip().lower()
    if encoding == "gzip":
        data = gzip.decompress(data)
    return data.decode(encoding=charset)
# Module-level MySQL connection and cursor used by the script body below.
# The connection is opened as soon as this module is executed or imported.
# NOTE(review): host and credentials are hard-coded -- consider moving them
# to configuration.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="root",
    db="test",
)
cursor = db.cursor()
if __name__ == '__main__':
    # Scrape one novel: insert its metadata into `novallist`, then download
    # every chapter linked from the index page and insert it into `novalcate`.
    site_root = "https://www.6yzw.com"
    bookid = 0  # remains 0 if the book row below cannot be inserted
    bookpath = "https://www.6yzw.com/63_63834/"
    bookdata = getURLData(path=bookpath, charset="gbk")
    html = etree.HTML(bookdata)

    booknameTag = "//div[@id='info']/h1/text()"
    bookname = html.xpath(booknameTag)[0]
    print("bookname", bookname)

    # The <p> text nodes under #info are evaluated once and indexed by
    # position; the slices below drop a fixed-length prefix from each value
    # (presumably a label -- verify against the page markup).
    bookAuthorTag = "//div[@id='info']/p/text()"
    infoTexts = html.xpath(bookAuthorTag)
    bookAuthor = infoTexts[2]
    print("bookAuthor", bookAuthor[7:])
    bookType = infoTexts[0]
    print("bookType", bookType[3:])

    try:
        # Parameterized query: the values are scraped text and must never be
        # spliced into the SQL string (injection, and breakage on quotes).
        cursor.execute(
            "INSERT INTO novallist(bookname,bookauthor,booktype) VALUES(%s,%s,%s)",
            (str(bookname), str(bookAuthor[7:]), str(bookType[3:])),
        )
        print("输出最新插入行的id", db.insert_id())
        bookid = db.insert_id()
        db.commit()
        print("add a new user successful")
    except Exception as e:
        print("add a new user failed:", e)
        db.rollback()

    bookListHrefTag = "//dl//dd/a/@href"
    bookListHref = html.xpath(bookListHrefTag)
    contentTag = "//div[@id='content']/text()"  # loop-invariant, hoisted
    booksHref = []
    for bookhref in bookListHref:
        cateurl = site_root + bookhref
        booksHref.append(cateurl)
        bookContentData = getURLData(path=cateurl, charset="gbk")
        if not bookContentData:
            # getURLData returns "" for a non-200 response; nothing to parse.
            print("skip chapter, download failed:", cateurl)
            continue
        Contenthtml = etree.HTML(bookContentData)
        content = Contenthtml.xpath(contentTag) if Contenthtml is not None else []
        if not content:
            # Without this guard content[0] below would raise IndexError and
            # abort the whole run on a single malformed chapter page.
            print("skip chapter, no content found:", cateurl)
            continue
        temp = "".join(content)
        print("temp", temp)
        print("在这里把每一章存到数据库", content[0])
        print("#####################################################################################")
        try:
            # The first text node of the content is stored as the chapter name,
            # the concatenation of all text nodes as the chapter body.
            cursor.execute(
                "INSERT INTO novalcate(bookid,catehref,content,cataname) VALUES(%s,%s,%s,%s)",
                (bookid, str(cateurl), temp, str(content[0])),
            )
            db.commit()
            print("add a new user successful")
        except Exception as e:
            print("add a new user failed:", e)
            db.rollback()
    print("bookListHref", booksHref)