I recently spent some time studying web scraping and saving the scraped information to both Excel and SQLite; the code below is provided for reference.
The code is as follows:
from bs4 import BeautifulSoup
import urllib.request,urllib.error
import sqlite3
import re
import openpyxl
def main():
    # 1. Crawl the pages
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    # 2. Save the data, both to an Excel workbook and to a SQLite database
    savepath = ".\\豆瓣电影Top250.xlsx"
    saveData(datalist, savepath)
    saveDb(datalist)
    print("爬取完毕")
# Link to the movie's detail page
findLink = re.compile(r'<a href="(.*?)">')  # compiled regular expression (string pattern)
# Poster image URL
findImg = re.compile(r'<img.*src="(.*?)"', re.S)
# Movie title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Movie rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings
findComment = re.compile(r'<span>(.*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Other movie details
findInfo = re.compile(r'<p class="">(.*?)<br/>', re.S)  # re.S lets "." match newlines as well
# Crawl the Douban pages starting from the given base URL
def getData(baseurl):
    data_all = []
    for page in range(0, 10):
        # Build the URL of each of the 10 pages (25 movies per page)
        url = baseurl + str(page * 25)
        html = askURL(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []  # information about one movie
            item = str(item)
            link = re.findall(findLink, item)[0]  # apply the regex to the item's HTML
            data.append(link)
            img = re.findall(findImg, item)[0]
            data.append(img)
            title = re.findall(findTitle, item)[0]
            data.append(title)
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            comment = re.findall(findComment, item)[0]
            data.append(comment)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
                data.append(inq.strip())
            else:
                data.append(" ")
            info = re.findall(findInfo, item)[0]
            info = re.sub(r"<br(\s+)?/>(\s+)?", " ", info)
            info = info.replace("<br/>", "")
            data.append(info.strip())
            data_all.append(data)
    return data_all
# Request the given URL while identifying as a regular browser
def askURL(url):
    # Browser-like headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
        # The user agent tells the server what kind of client is making the request and what data it can handle
    }
    req = urllib.request.Request(url=url, headers=headers)
    html = ""
    try:
        # Open the requested page
        response = urllib.request.urlopen(req)
        # Read the HTML content
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Save the scraped data to an Excel workbook
def saveData(datalist, savepath):
    print('saving excel...')
    workbook = openpyxl.Workbook()  # create the workbook
    worksheet = workbook.active     # reuse the default sheet instead of adding a second, empty one
    worksheet.title = "sheet1"
    # Header row
    title = ['影片链接', '影片海报', '影片名称', '影片评分', '评价人数', '影片简介', '其他信息']
    for i in range(0, 7):
        worksheet.cell(1, i + 1, title[i])
    # Data rows
    for row in range(0, len(datalist)):
        for col in range(0, len(datalist[row])):
            worksheet.cell(row + 2, col + 1, datalist[row][col])
    workbook.save(savepath)
# Save the scraped data to SQLite
def saveDb(datalist):
    print('saving db...')
    # Open (or create) the database file
    conn = sqlite3.connect('DoubanMovie.db')
    # Create the table if it does not exist yet
    sql_create = '''
        CREATE TABLE IF NOT EXISTS movie_top250(
            id integer primary key,
            link text,
            pic text,
            name text,
            rate real,
            rate_account real,
            introduction text,
            info text
        )
    '''
    cursor = conn.cursor()
    cursor.execute(sql_create)
    # Insert one row per movie; a parameterized query keeps quotes in titles or summaries from breaking the SQL
    sql_insert = """insert into movie_top250(link, pic, name, rate, rate_account, introduction, info)
                    values(?, ?, ?, ?, ?, ?, ?)"""
    try:
        for item in datalist:
            cursor.execute(sql_insert, item)
    except BaseException as e:
        print('except...', e)
    finally:
        # Commit the inserts
        conn.commit()
        # Close the connection
        conn.close()
if __name__ == '__main__':
    main()
Excel output:
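As a quick sanity check (not part of the script above), the saved workbook can be read back with openpyxl; the file name and sheet name below are simply the ones saveData() uses:

import openpyxl

# Open the workbook written by saveData() and print the header plus the first few rows
wb = openpyxl.load_workbook("豆瓣电影Top250.xlsx")
ws = wb["sheet1"]
for row in ws.iter_rows(min_row=1, max_row=6, values_only=True):
    print(row)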
SQLite output:
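Similarly, a minimal sketch (again separate from the script itself) for checking what ended up in the database, assuming the DoubanMovie.db file created by saveDb():

import sqlite3

# Count the stored movies and show a few of the highest-rated ones
conn = sqlite3.connect("DoubanMovie.db")
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM movie_top250")
print("rows:", cursor.fetchone()[0])
cursor.execute("SELECT name, rate FROM movie_top250 ORDER BY rate DESC LIMIT 5")
for name, rate in cursor.fetchall():
    print(name, rate)
conn.close()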