前言
这几天在学 Python,跟着 B 站教程做了一个爬取豆瓣 Top250 电影的项目,我这里数据库用的是 MySQL。
一、什么是爬虫?
模拟客户端发送网络请求,接收请求响应,按照一定的规则自动地抓取互联网信息的程序。
二、使用步骤
1.抓取网页,并返回从网页中获取到的源码
通过url向服务器发起request请求,请求可以包含额外的header信息。
代码如下(示例):
# Fetch the HTML content of one URL.
def askURL(url):
    """Request *url* and return the decoded HTML body, or '' on failure."""
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the response/socket is closed
        # (original code leaked it on every call).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
2.解析数据
这里用了 BeautifulSoup 定位每部电影的节点,再配合正则表达式提取具体字段。
代码如下(示例):
def getData(baseurl):
    """Scrape all 10 pages of the douban Top250 list.

    Returns a list of per-movie records:
    [link, img_src, cname, oname, rating, vote_count, synopsis, bd_info]
    """
    datalist = []
    for i in range(10):  # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)  # raw HTML of one page
        soup = BeautifulSoup(html, "html.parser")
        # Each movie entry lives in a <div class="item">.
        for item in soup.find_all('div', class_="item"):
            data = []  # all fields for one movie
            item = str(item)
            data.append(re.findall(findLink, item)[0])     # detail-page link
            data.append(re.findall(findImgSrc, item)[0])   # poster URL
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title plus original title (leading '/' stripped).
                data.append(titles[0])
                data.append(titles[1].replace("/", ""))
            else:
                data.append(titles[0])
                data.append('')  # placeholder: no foreign title
            data.append(re.findall(findRating, item)[0])   # score
            data.append(re.findall(findJudge, item)[0])    # vote count
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", ""))      # drop trailing full stop
            else:
                data.append('')
            bd = re.findall(findBd, item)[0]
            # Raw string: '\s' inside a plain literal is an invalid escape
            # sequence (SyntaxWarning on modern Python).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)
            bd = re.sub('/', "", bd)
            data.append(bd.strip())
            datalist.append(data)
    print(datalist)
    return datalist
3.保存数据
首先我们要连接 MySQL 并创建一个数据表
代码如下(示例):
# (Re)create the doubanTest table.
def createTest():
    """Connect to MySQL, drop any existing doubanTest table and create it anew."""
    DBHOST = 'localhost'
    DBUSER = 'root'
    DBPASS = 'admin123-'  # password
    DBNAME = 'test'       # database name
    db = None
    try:
        db = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBNAME)
        cur = db.cursor()  # cursor for issuing statements
        cur.execute('DROP TABLE IF EXISTS doubanTest')
        # NOTE: 'instroduction' is a typo but the INSERT in saveData uses the
        # same spelling, so it is kept for compatibility.
        sql = '''
        CREATE TABLE doubanTest
        (
            id INT PRIMARY KEY AUTO_INCREMENT,
            info_link TEXT,
            pic_link TEXT,
            cname VARCHAR(255),
            oname VARCHAR(255),
            score FLOAT,
            rated INT,
            instroduction TEXT,
            info TEXT
        );
        '''
        cur.execute(sql)
        print("数据表创建成功!")
    except pymysql.Error as e:
        print("数据表创建失败!" + str(e))
    finally:
        # Original code leaked the connection on every call.
        if db is not None:
            db.close()
然后我们要把解析完的数据塞进表里
代码如下(示例):
# Persist the scraped records into MySQL.
def saveData(datalist):
    """Insert every movie record of *datalist* into the doubantest table.

    Uses a parameterized query: the original code built the SQL by string
    concatenation with hand-added double quotes, which broke on any quote
    character in the scraped text and was SQL-injection prone. It also
    mutated the caller's datalist and never closed the connection.
    """
    createTest()
    DBHOST = 'localhost'
    DBUSER = 'root'
    DBPASS = 'admin123-'  # password
    DBNAME = 'test'       # database name
    db = None
    try:
        db = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBNAME)
        cur = db.cursor()  # cursor for issuing statements
        # %s placeholders are escaped by the driver, not by us.
        sql = '''
        INSERT INTO doubantest(
            info_link, pic_link, cname, oname,
            score, rated, instroduction, info
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        '''
        cur.executemany(sql, datalist)
        db.commit()
        print("数据插入成功!")
    except pymysql.Error as e:
        print("数据插入失败" + str(e))
        if db is not None:
            db.rollback()  # undo the partial batch
    finally:
        if db is not None:
            db.close()
4.完成
完整代码如下:
# -*- coding = utf-8 -*-
# @Time : 2021/4/26 20:48
# @Author : chen
# @File : douban.py
# @Software : PyCharm
from bs4 import BeautifulSoup #网页解析,获取数据
import re #正则表达式,进行文字匹配
import urllib.request,urllib.error #指定URL,获取网页数据
import xlwt #进行Excel操作
import pymysql #进行mysql数据库操作
# Pre-compiled regular expressions used by getData() to pull fields
# out of one movie's <div class="item"> HTML fragment.
findLink = re.compile(r'<a href="(.*?)">') # detail-page link
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S) # poster image URL (re.S: '.' also matches newlines)
findTitle = re.compile(r'<span class="title">(.*)</span>') # movie title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>') # rating score
findJudge = re.compile(r'<span>(\d*)人评价</span>') # number of raters
findInq = re.compile(r'<span class="inq">(.*)</span>') # one-line synopsis
findBd = re.compile(r'<p class="">(.*?)</p>',re.S) # director/cast info block
def main():
    """Scrape the douban Top250 pages and store the results in MySQL."""
    base_url = "https://movie.douban.com/top250?start="
    # Scrape first, then persist.
    movies = getData(base_url)
    saveData(movies)
# Scrape and parse the Top250 pages.
def getData(baseurl):
    """Scrape all 10 pages of the douban Top250 list.

    Returns a list of per-movie records:
    [link, img_src, cname, oname, rating, vote_count, synopsis, bd_info]
    """
    datalist = []
    for i in range(10):  # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)  # raw HTML of one page
        soup = BeautifulSoup(html, "html.parser")
        # Each movie entry lives in a <div class="item">.
        for item in soup.find_all('div', class_="item"):
            data = []  # all fields for one movie
            item = str(item)
            data.append(re.findall(findLink, item)[0])     # detail-page link
            data.append(re.findall(findImgSrc, item)[0])   # poster URL
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title plus original title (leading '/' stripped).
                data.append(titles[0])
                data.append(titles[1].replace("/", ""))
            else:
                data.append(titles[0])
                data.append('')  # placeholder: no foreign title
            data.append(re.findall(findRating, item)[0])   # score
            data.append(re.findall(findJudge, item)[0])    # vote count
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", ""))      # drop trailing full stop
            else:
                data.append('')
            bd = re.findall(findBd, item)[0]
            # Raw string: '\s' inside a plain literal is an invalid escape
            # sequence (SyntaxWarning on modern Python).
            bd = re.sub(r'<br(\s+)?/>(\s+)?', "", bd)
            bd = re.sub('/', "", bd)
            data.append(bd.strip())
            datalist.append(data)
    print(datalist)
    return datalist
# Fetch the HTML content of one URL.
def askURL(url):
    """Request *url* and return the decoded HTML body, or '' on failure."""
    # Spoof a browser User-Agent so the site does not reject the request.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"}
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager guarantees the response/socket is closed
        # (original code leaked it on every call).
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# (Re)create the doubanTest table.
def createTest():
    """Connect to MySQL, drop any existing doubanTest table and create it anew."""
    DBHOST = 'localhost'
    DBUSER = 'root'
    DBPASS = 'admin123-'  # password
    DBNAME = 'test'       # database name
    db = None
    try:
        db = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBNAME)
        cur = db.cursor()  # cursor for issuing statements
        cur.execute('DROP TABLE IF EXISTS doubanTest')
        # rated stores a vote count -> INT (the two copies of this function in
        # the post disagreed: INT vs NUMERIC).
        # NOTE: 'instroduction' is a typo but the INSERT in saveData uses the
        # same spelling, so it is kept for compatibility.
        sql = '''
        CREATE TABLE doubanTest
        (
            id INT PRIMARY KEY AUTO_INCREMENT,
            info_link TEXT,
            pic_link TEXT,
            cname VARCHAR(255),
            oname VARCHAR(255),
            score FLOAT,
            rated INT,
            instroduction TEXT,
            info TEXT
        );
        '''
        cur.execute(sql)
        print("数据表创建成功!")
    except pymysql.Error as e:
        print("数据表创建失败!" + str(e))
    finally:
        # Original code leaked the connection on every call.
        if db is not None:
            db.close()
# Persist the scraped records into MySQL.
def saveData(datalist):
    """Insert every movie record of *datalist* into the doubantest table.

    Uses a parameterized query: the original code built the SQL by string
    concatenation with hand-added double quotes, which broke on any quote
    character in the scraped text and was SQL-injection prone. It also
    mutated the caller's datalist and never closed the connection.
    """
    createTest()
    DBHOST = 'localhost'
    DBUSER = 'root'
    DBPASS = 'admin123-'  # password
    DBNAME = 'test'       # database name
    db = None
    try:
        db = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBNAME)
        cur = db.cursor()  # cursor for issuing statements
        # %s placeholders are escaped by the driver, not by us.
        sql = '''
        INSERT INTO doubantest(
            info_link, pic_link, cname, oname,
            score, rated, instroduction, info
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        '''
        cur.executemany(sql, datalist)
        db.commit()
        print("数据插入成功!")
    except pymysql.Error as e:
        print("数据插入失败" + str(e))
        if db is not None:
            db.rollback()  # undo the partial batch
    finally:
        if db is not None:
            db.close()
# Script entry point.
if __name__ == '__main__':
    main()
总结
1、发起请求 2、获取响应内容 3、解析内容 4、保存数据