爬虫demo - 对豆瓣的爬取
从
https://movie.douban.com/top250?start=
获取排名前250的电影信息并保存到xls文档和sqlite数据库中
数据爬取
urllib库/requests方法
head 保存用户信息,告诉服务器-“我不是爬虫” 按网页 F12/network获得用户信息 将head封装到urllib.request.Request类 urllib.request.urlopen(request)打开网页 response.read().decode("utf-8")读取网页html
# 得到指定一个url网页内容
def askURL(url):
head = {
"Cookie": 'douban-fav-remind=1; gr_user_id=4c11abd8-6f6c-478b-bcea-ca301f211557; _vwo_uuid_v2=DAF3A4BB603C231E6573F1726D9196D8B|9dc3b83692ffc0d76ca109e98e0d5f4e; viewed="30310517_1231244"; bid=43x_Pr3PjFU; __utmc=30149280; __utmc=223695111; __utmz=223695111.1612103465.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=LiQaZ9RXj2SgLLepXdmcuMqIXjkDZa9r; dbcl2="232334512:u1OZQ0QZ9Wg"; ck=Uniy; push_noty_num=0; push_doumail_num=0; __utma=30149280.2076513241.1581234158.1612542816.1612709363.8; __utmz=30149280.1612709363.8.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.23233; __utmb=30149280.2.10.1612709363; __utma=223695111.654026043.1612103465.1612542816.1612709370.4; __utmb=223695111.0.10.1612709370; _pk_ses.100001.4cf6=*; __gads=ID=110bc0c08292a04a-221341d0f6c500b1:T=1612709372:RT=1612709372:R:S=ALNI_MZ4knUx22fD3-Fub_fVkHiVdeXqjw; _pk_id.100001.4cf6=637c3acd86dad356.1612103464.4.1612709996.1612542816.'
, "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36"
}
request = urllib.request.Request(url, headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
#print(html)
except urllib.error.URLError as e:
if hasattr(e, "code"):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
return html
HTML数据分析
BeautifulSoup库(靓汤)
爬取网页
分析网址 https://movie.douban.com/top250?start=
后面加的是个数
for循环达到翻页的效果
https://movie.douban.com/top250?start=
https://movie.douban.com/top250?start=25
https://movie.douban.com/top250?start=50 …
https://movie.douban.com/top250?start=225
soup = BeautifulSoup(html, "html.parser")用html.parser方式解析text
用正则表达式匹配内容
def getData(baseurl):
datalist = []
for i in range(0, 10): # 翻页
url = baseurl + str(i*25)
html = askURL(url) # 保存网页源码
# 2.逐一提取数据
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all('div', class_="item"):
# print(item) #item的全部信息
data = [] # 保存一个电影的全部信息
item = str(item)
title = re.findall(findTitle, item)
if len(title) == 2: #可能有多个名字
data.append(title[0])
data.append(title[1].replace("/", ""))
else:
data.append(title)
data.append(' ')
imgsrc = re.findall(findImgSrc, item)[0]
data.append(imgsrc)
link = re.findall(findlink, item)[0]
data.append(link)
rating = re.findall(findRating, item)[0]
data.append(rating)
judgeNum = re.findall(findJudge, item)[0]
data.append(judgeNum)
inq = re.findall(findInq, item) #也许没有评价
if len(inq) != 0:
data.append(inq[0].replace("。 ", ""))
else:
data.append(" ")
bd = re.findall(findBd, item)[0]
bd = str(bd)
bd = re.sub('<br(\s+)?/>(\s)?', " ", bd) # 去掉<br/>
bd = re.sub(r'/', ' ', bd)
data.append(bd.strip())
datalist.append(data)
# print(datalist)
return datalist
re库–正则表达式
findlink = re.compile(r'<a href="(.*?)">', re.S) # 链接的格式
findImgSrc = re.compile(r'<img .*src="(.*?)"', re.S) # 图片的格式 re.S忽略换行符
findTitle = re.compile(r'<span class="title">(.*)</span>') # 影片片名
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>') # 评分
findJudge = re.compile(r'<span>(\d*)人评价</span>') # 评价人数
findInq = re.compile(r'<span class="inq">(.*)</span>') # 评价
findBd = re.compile(r'<p class="">(.*?)</p>', re.S) # 导演信息
数据存储
xls(excel)保存
保存数据
def saveData(datalist, savepath):
print("save...")
wookbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
worksheet = wookbook.add_sheet("豆瓣电影TOP250", cell_overwrite_ok=True)
col = ("电影中文名", "电影外文名", "照片", "链接", "评分", "评价数", "概述", "相关信息")
for i in range(0, 8):
worksheet.write(0, i, col[i])
for i in range(0, 250):
print("第%d条"%(i+1))
data = datalist[i]
for j in range(0, 8):
worksheet.write(i+1, j, data[j])
wookbook.save(savepath)
sqlite数据库保存
构建数据库
def init_db(dbpth):
sql = '''
create table doubanmovieTOP250
(id integer primary key autoincrement,
cname varchar,
ename varchar,
pic_link text,
info_link text,
rated numeric,
score numeric,
inq text,
bd text);
'''
conn = sqlite3.connect(dbpth)
c = conn.cursor()
c.execute(sql)
conn.commit()
conn.close()
保存到数据库
def saveData2(datalist, dbpath):
conn = sqlite3.connect(dbpath)
c = conn.cursor()
for data in datalist:
for index in range(len(data)):
if index == 4 or index == 5:
continue
data[index] = '"'+str(data[index])+'"'
sql = '''
insert into doubanmovieTOP250(cname, ename, pic_link, info_link, rated, score, inq, bd)
values(%s)''' % ",".join(data)
c.execute(sql)
conn.commit()
c.close()
conn.close()
全部代码
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import sqlite3
def main():
baseurl = "https://movie.douban.com/top250?start="
datalist = getData(baseurl)
# savepath = ".\\豆瓣电影Top250.xls"
# saveData(datalist, savepath)
# askURL(baseurl)
dbpath = "movie.db"
saveData2(datalist, dbpath)
findlink = re.compile(r'<a href="(.*?)">', re.S) # 链接的格式
findImgSrc = re.compile(r'<img .*src="(.*?)"', re.S) # 图片的格式 re.S忽略换行符
findTitle = re.compile(r'<span class="title">(.*)</span>') # 影片片名
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>') # 评分
findJudge = re.compile(r'<span>(\d*)人评价</span>') # 评价人数
findInq = re.compile(r'<span class="inq">(.*)</span>') # 评价
findBd = re.compile(r'<p class="">(.*?)</p>', re.S) # 导演信息
# 爬取网页
def getData(baseurl):
datalist = []
for i in range(0, 10): # 翻页
url = baseurl + str(i*25)
html = askURL(url) # 保存网页源码
# 2.逐一提取数据
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all('div', class_="item"):
# print(item) #item的全部信息
data = [] # 保存一个电影的全部信息
item = str(item)
title = re.findall(findTitle, item)
if len(title) == 2: #可能有多个名字
data.append(title[0])
data.append(title[1].replace("/", ""))
else:
data.append(title)
data.append(' ')
imgsrc = re.findall(findImgSrc, item)[0]
data.append(imgsrc)
link = re.findall(findlink, item)[0]
data.append(link)
rating = re.findall(findRating, item)[0]
data.append(rating)
judgeNum = re.findall(findJudge, item)[0]
data.append(judgeNum)
inq = re.findall(findInq, item) #也许没有评价
if len(inq) != 0:
data.append(inq[0].replace("。 ", ""))
else:
data.append(" ")
bd = re.findall(findBd, item)[0]
bd = str(bd)
bd = re.sub('<br(\s+)?/>(\s)?', " ", bd) # 去掉<br/>
bd = re.sub(r'/', ' ', bd)
data.append(bd.strip())
datalist.append(data)
# print(datalist)
return datalist
# 保存数据
def saveData(datalist, savepath):
print("save...")
wookbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
worksheet = wookbook.add_sheet("豆瓣电影TOP250", cell_overwrite_ok=True)
col = ("电影中文名", "电影外文名", "照片", "链接", "评分", "评价数", "概述", "相关信息")
for i in range(0, 8):
worksheet.write(0, i, col[i])
for i in range(0, 250):
print("第%d条"%(i+1))
data = datalist[i]
for j in range(0, 8):
worksheet.write(i+1, j, data[j])
wookbook.save(savepath)
def saveData2(datalist, dbpath):
conn = sqlite3.connect(dbpath)
c = conn.cursor()
for data in datalist:
for index in range(len(data)):
if index == 4 or index == 5:
continue
data[index] = '"'+str(data[index])+'"'
sql = '''
insert into doubanmovieTOP250(cname, ename, pic_link, info_link, rated, score, inq, bd)
values(%s)''' % ",".join(data)
c.execute(sql)
conn.commit()
c.close()
conn.close()
def init_db(dbpth):
sql = '''
create table doubanmovieTOP250
(id integer primary key autoincrement,
cname varchar,
ename varchar,
pic_link text,
info_link text,
rated numeric,
score numeric,
inq text,
bd text);
'''
conn = sqlite3.connect(dbpth)
c = conn.cursor()
c.execute(sql)
conn.commit()
conn.close()
# 得到指定一个url网页内容
def askURL(url):
head = {
"Cookie": 'douban-fav-remind=1; gr_user_id=4c11abd8-6f6c-478b-bcea-ca301f211557; _vwo_uuid_v2=DAF3A4BB603C231E6573F1726D9196D8B|9dc3b83692ffc0d76ca109e98e0d5f4e; viewed="30310517_1231244"; bid=43x_Pr3PjFU; __utmc=30149280; __utmc=223695111; __utmz=223695111.1612103465.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=LiQaZ9RXj2SgLLepXdmcuMqIXjkDZa9r; dbcl2="232334512:u1OZQ0QZ9Wg"; ck=Uniy; push_noty_num=0; push_doumail_num=0; __utma=30149280.2076513241.1581234158.1612542816.1612709363.8; __utmz=30149280.1612709363.8.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.23233; __utmb=30149280.2.10.1612709363; __utma=223695111.654026043.1612103465.1612542816.1612709370.4; __utmb=223695111.0.10.1612709370; _pk_ses.100001.4cf6=*; __gads=ID=110bc0c08292a04a-221341d0f6c500b1:T=1612709372:RT=1612709372:R:S=ALNI_MZ4knUx22fD3-Fub_fVkHiVdeXqjw; _pk_id.100001.4cf6=637c3acd86dad356.1612103464.4.1612709996.1612542816.'
, "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36"
}
request = urllib.request.Request(url, headers=head)
html = ""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
#print(html)
except urllib.error.URLError as e:
if hasattr(e, "code"):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
return html
if __name__ == "__main__":
main()
# init_db("movie.db")
print("爬取完成")