此项目目的是爬取网页(netbian)中的图片信息储存到数据库,包括图片名称、缩略图url、大图url
插入到数据库后的效果:
1、安装pymysql
pymysql是用于python连接MySQL数据库,性质类似于jdbc
安装:使用pycharm终端输入:
pip install pymysql
2、建立picture类
建这个类是方便后续统一储存图片信息,方便后续执行插入操作
这里是新建的py文件,后续需要导包
class Picture:
def __init__(self, name, src, url):
self.name = name
self.src = src
self.url = url
3、导包
requests发起请求,接收响应
lxml解析html网页
pymysql连接MySQL数据库
picture为自定义类
import requests
from lxml import etree
import pymysql
import picture
4、数据库初始操作
# 建立数据库连接
conn = pymysql.connect(
host="localhost",
user="root",
password="123456",
database="数据库名称"
)
# 创建一个游标对象
cursor = conn.cursor()
5、爬虫操作
包括发起请求、接收响应、解析网页,最后将网页中的信息保存在列表中
URL = 'https://pic.netbian.com/4kdongman/index_'
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
}
# the list of request url
URLList = []
# result
picList = []
nameList = []
srcList = []
urlList = []
fr = int(input('page from:'))
to = int(input('page to:'))
for i in range(fr, to):
URLList.append(URL + str(i) + '.html')
# print(srcList)
for Url in URLList:
response = requests.get(Url, headers)
response.encoding = 'gbk'
html = response.text
# print(html)
tree = etree.HTML(html)
# the list of picture name over
nameList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@alt')
# the list of picture src over
srcList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@src')
for i in range(len(srcList)):
srcList[i] = 'https://pic.netbian.com' + srcList[i]
# print(nameList)
# print(srcList)
aList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/@href')
for i in range(len(aList)):
aList[i] = 'https://pic.netbian.com' + aList[i]
# print(aList)
# request of detail.html second res
for a in aList:
detail_response = requests.get(a, headers)
detail_response.encoding = 'gbk'
detail_html = detail_response.text
# print(html)
detail_tree = etree.HTML(detail_html)
# the list of picture url over
url = detail_tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/a/img/@src')
url[0] = 'https://pic.netbian.com' + url[0]
# print(url[0])
urlList.append(url[0])
detail_response.close()
# print(urlList)
# print(len(nameList), len(srcList), len(urlList))
# 封装类 picList over
for i in range(len(nameList)):
picList.append(picture.Picture(nameList[i], srcList[i], urlList[i]))
6、插入到数据库
将已经存储在列表中的数据插入到数据库,并使用异常处理
# 使用循循环将数据插入表中
try:
# insert
for value in picList:
cursor.execute('INSERT INTO picture (name, src, url) VALUES (%s, %s, %s)',
(value.name, value.src, value.url))
# 提交更改
conn.commit()
print('第' + str(fr) + '页ok')
fr += 1
except:
print('插入失败')
response.close()
# 关闭游标和连接
cursor.close()
conn.close()
7、完整代码
import requests
from lxml import etree
import pymysql
import picture
if __name__ == "__main__":
# 建立数据库连接
conn = pymysql.connect(
host="localhost",
user="root",
password="123456",
database="数据库名称"
)
# 创建一个游标对象
cursor = conn.cursor()
URL = 'https://pic.netbian.com/4kdongman/index_'
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
}
# the list of request url
URLList = []
# result
picList = []
nameList = []
srcList = []
urlList = []
fr = int(input('page from:'))
to = int(input('page to:'))
for i in range(fr, to):
URLList.append(URL + str(i) + '.html')
# print(srcList)
for Url in URLList:
response = requests.get(Url, headers)
response.encoding = 'gbk'
html = response.text
# print(html)
tree = etree.HTML(html)
# the list of picture name over
nameList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@alt')
# the list of picture src over
srcList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/img/@src')
for i in range(len(srcList)):
srcList[i] = 'https://pic.netbian.com' + srcList[i]
# print(nameList)
# print(srcList)
aList = tree.xpath('/html/body/div[2]/div/div[3]/ul/li/a/@href')
for i in range(len(aList)):
aList[i] = 'https://pic.netbian.com' + aList[i]
# print(aList)
# request of detail.html second res
for a in aList:
detail_response = requests.get(a, headers)
detail_response.encoding = 'gbk'
detail_html = detail_response.text
# print(html)
detail_tree = etree.HTML(detail_html)
# the list of picture url over
url = detail_tree.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/a/img/@src')
url[0] = 'https://pic.netbian.com' + url[0]
# print(url[0])
urlList.append(url[0])
detail_response.close()
# print(urlList)
# print(len(nameList), len(srcList), len(urlList))
# 封装类 picList over
for i in range(len(nameList)):
picList.append(picture.Picture(nameList[i], srcList[i], urlList[i]))
# 使用循循环将数据插入表中
try:
# insert
for value in picList:
cursor.execute('INSERT INTO picture (name, src, url) VALUES (%s, %s, %s)',
(value.name, value.src, value.url))
# 提交更改
conn.commit()
print('第' + str(fr) + '页ok')
fr += 1
except:
print('插入失败')
response.close()
# 关闭游标和连接
cursor.close()
conn.close()