Python使用requests模块请求网址,使用lxml模块中的etree解析并抓取数据,并使用time模块进行延时。
爬取的页面为:
运行结果如下图所示:
python代码如下:
# _*_ coding:utf _*_
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/8 11:05
# 工具:PyCharm
import requests
from lxml import etree
import re
import time
# Request headers shared by every HTTP call: a Douban session cookie plus a
# desktop-browser User-Agent.
# NOTE(review): the hard-coded cookie will expire; refresh it (and the UA if
# needed) when requests start failing or redirecting.
headers = {
'Cookie': 'll="118375"; bid=LweMDRu6xy0; __utmz=30149280.1582607485.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utmz=223695111.1583572638.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __yadk_uid=sIlSb4fUzktAAB7ns01bryACK9TG0Ytt; _vwo_uuid_v2=D4D430B7FCD55769AFD16F4AB7B8A5907|ae49228565fb206135f49f584eb2c78e; __gads=ID=a3adab5ce8eafc57:T=1583573105:S=ALNI_MYInfQ1FlG09Ho82DR2aEpSSXRC_Q; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583668047%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.502674428.1582607485.1583572636.1583668047.3; __utmb=30149280.0.10.1583668047; __utmc=30149280; __utma=223695111.2100654023.1583572638.1583572638.1583668047.2; __utmb=223695111.0.10.1583668047; __utmc=223695111; _pk_id.100001.4cf6=96f806704894c344.1583572638.2.1583668060.1583573242.',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
def print_movies(movies):
    """Pretty-print one movie's fields to stdout between two star rules.

    The '主演' (cast) value is a list and gets one line per actor, with only
    the first actor carrying the key label.  Every printed field is preceded
    by a 0.5 s pause, which throttles console output.
    """
    print('*' * 100)
    for key in movies:
        time.sleep(0.5)
        if key != '主演':
            print(key, ':', movies[key])
            continue
        for idx, actor in enumerate(movies['主演']):
            time.sleep(0.5)
            if idx == 0:
                print(key, ':', actor)
            else:
                print(' ', actor)
    print("*" * 100)
def HTML_spider_detial(url):
    """Fetch *url* with the shared headers (redirects disabled) and return
    the body parsed into an lxml HTML element tree."""
    page = requests.get(url, headers=headers, allow_redirects=False)
    return etree.HTML(page.content.decode('utf-8'))
def get_detial_urls(url):
    """Scrape one Top-250 list page: collect every movie-detail link and
    crawl each one, pausing 0.5 s between requests."""
    html = HTML_spider_detial(url)
    for detail_link in html.xpath('//div[@class = "info"]/div[1]//@href'):
        print('正在请求网址:', detail_link, '中....................')
        spider_detials_url(detail_link)
        time.sleep(0.5)  # polite delay between detail-page requests
def spider_detials_url(url):
    """Crawl a single movie detail page at *url* and print its fields.

    Extracts name, year, director, rating, cast, language, release date,
    runtime and synopsis into a dict keyed by Chinese labels, then hands
    the dict to print_movies().
    """
    # Fresh dict for this movie's fields.
    movies = {}
    html = HTML_spider_detial(url)
    # Title: first <span> under the page's content header.
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Year: second header span, shown as "(YYYY)" — strip the parentheses.
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]',movie_year)[0]
    movies['年份'] = movie_year
    # Director (first linked name in the info box).
    movie_drector = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_drector
    # Douban rating.
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast: list of actor names (may contain several entries).
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    # Walk every text node of the info box and pick values by their labels.
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(0,len(infos),1):
        if infos[index] == "语言:":
            movie_language = infos[index+1]
            movies['语言'] = movie_language
        elif infos[index] == '上映日期:':
            # NOTE(review): offset of +2 here (vs +1 for language) —
            # presumably skips an intermediate text node; confirm against
            # the live page markup.
            movie_time = infos[index+2]
            movies["上映时间"] = movie_time
        elif infos[index] == "片长:":
            # Same +2 offset as the release date above.
            movie_long = infos[index+2]
            movies['片长'] = movie_long
    # Synopsis, whitespace-trimmed.
    movie_simple = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_simple
    print_movies(movies)
if __name__ == '__main__':
    # Douban's Top 250 is paginated 25 movies per page: start=0,25,...,225.
    for offset in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
        get_detial_urls(page_url)
修改代码,将数据储存到excel中
运行如下:
详细代码如下:
# _*_ coding:utf _*_
# 邮箱:3195841740@qq.com
# 人员:21292
# 日期:2020/3/8 11:05
# 工具:PyCharm
import requests
from lxml import etree
import re
import time
import openpyxl
# Request headers for every crawl; if scraping starts failing, refresh these
# two values (the session cookie expires and the User-Agent may get blocked).
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
'Cookie': 'll="118375"; bid=Gnw8x-tUTyQ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583715518%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=dac2da16aa651d16.1582123086.16.1583716968.1583677611.; __yadk_uid=skuIGmPsoBorvw32ahEZf6sqfam16Rtj; __utma=30149280.1846912402.1582123089.1583715518.1583728695.18; __utmz=30149280.1583677417.16.10.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1046665669.1582123089.1583677417.1583715518.16; __utmz=223695111.1583677417.15.9.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=e685b433defaf33d:T=1582374422:S=ALNI_MbPZ69DTsUEApb-_etqVfoNXEAO5g; ct=y; _vwo_uuid_v2=D630691FC9560003A3D895CB985C1B204|b22c83cc5ece5c99126b175f0bd35663; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18932; __utmc=30149280; __utmc=223695111; dbcl2="189323804:LgqGltbQVkw"; ck=p11c; __utmb=30149280.6.9.1583728745418; __utmt=1'
}
# Write the scraped data to an Excel workbook.
def save_detials_movies(movies_list, count=None):
    """Write every scraped movie to '豆瓣电影.xlsx', one row per movie.

    The workbook is rebuilt from scratch on every call, so calling this
    after each newly scraped movie keeps the file current.

    Args:
        movies_list: list of movie dicts as produced by spider_detials_url.
        count: unused; kept only for backward compatibility (the original
            signature carried a mutable-default call counter here).

    Bug fixes vs. the original: the old loop indexed movies_list by the
    Excel row number (starting at 2), which silently skipped the first two
    movies and never wrote the last two; rows and list indices are now
    tracked independently.  Optional fields ('语言', '上映时间', '片长')
    that a page may lack no longer raise KeyError.
    """
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = "电影数据"
    # Header row in columns A..I.
    titles = ('电影名称', '年份', '导演', '豆瓣评分', '主演',
              '语言', '上映时间', '片长', '电影简介')
    for col, title in zip('ABCDEFGHI', titles):
        sheet[col + '1'] = title
    # Data rows start at Excel row 2; movies_list index runs from 0.
    for row, movie in enumerate(movies_list, start=2):
        sheet['A' + str(row)] = movie.get('电影名称', '')
        sheet['B' + str(row)] = movie.get('年份', '')
        sheet['C' + str(row)] = movie.get('导演', '')
        sheet['D' + str(row)] = movie.get('豆瓣评分', '')
        # '主演' is a list of actor names; store them space-separated.
        sheet['E' + str(row)] = ' '.join(movie.get('主演', []))
        sheet['F' + str(row)] = movie.get('语言', '')
        sheet['G' + str(row)] = movie.get('上映时间', '')
        sheet['H' + str(row)] = movie.get('片长', '')
        sheet['I' + str(row)] = movie.get('电影简介', '')
    # Persist the workbook.
    file.save('豆瓣电影.xlsx')
# Print one movie dict to the console.
def print_movies(movies):
    """Dump one movie's fields to stdout between two rules of asterisks.

    The '主演' (cast) entry is a list and is printed one actor per line,
    with the key label only on the first line.  A 0.5 s pause precedes
    every printed field to throttle output.
    """
    print('*' * 100)
    for key in movies:
        time.sleep(0.5)
        if key != '主演':
            print(key, ':', movies[key])
            continue
        for idx, actor in enumerate(movies['主演']):
            time.sleep(0.5)
            if idx == 0:
                print(key, ':', actor)
            else:
                print(' ', actor)
    print("*" * 100)
# Fetch and parse a URL, returning the parsed tree.
def HTML_spider_detial(url):
    """GET *url* with the shared headers (no redirects) and return the body
    parsed as an lxml HTML element tree."""
    resp = requests.get(url, headers=headers, allow_redirects=False)
    return etree.HTML(resp.content.decode('utf-8'))
# Extract each movie's URL from a list page and crawl it.
def get_detial_urls(url, count=[1]):
    """Scrape one Top-250 list page and crawl every movie link on it.

    NOTE: *count* deliberately uses a mutable default so the request
    counter persists across calls — do not "fix" it to None.
    """
    html = HTML_spider_detial(url)
    links = html.xpath('//div[@class = "info"]/div[1]//@href')
    for link in links:
        print('正在请求第' + str(count[0]) + '个网址:', link, '中....................')
        spider_detials_url(link)
        count[0] += 1
        time.sleep(0.5)  # polite delay between detail-page requests
# Module-level accumulator: every scraped movie dict is appended here.
movies_list = []
# Scrape one movie's data into a dict, then persist the running list.
def spider_detials_url(url):
    """Crawl a single movie detail page at *url*.

    Extracts name, year, director, rating, cast, language, release date,
    runtime and synopsis into a dict keyed by Chinese labels, appends it
    to the module-level movies_list, and rewrites the Excel file with
    everything collected so far.
    """
    # Fresh dict for this movie's fields.
    movies = {}
    html = HTML_spider_detial(url)
    # Title: first <span> under the page's content header.
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Year: second header span, shown as "(YYYY)" — strip the parentheses.
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]',movie_year)[0]
    movies['年份'] = movie_year
    # Director (first linked name in the info box).
    movie_drector = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_drector
    # Douban rating.
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast: list of actor names.
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    # Walk every text node of the info box and pick values by label.
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(0,len(infos),1):
        # Language: the node right after the "语言:" label.
        if infos[index] == "语言:":
            movie_language = infos[index+1]
            movies['语言'] = movie_language
        # Release date.
        # NOTE(review): offset of +2 here (vs +1 for language) — presumably
        # skips an intermediate text node; confirm against live markup.
        elif infos[index] == '上映日期:':
            movie_time = infos[index+2]
            movies["上映时间"] = movie_time
        # Runtime, same +2 offset as the release date.
        elif infos[index] == "片长:":
            movie_long = infos[index+2]
            movies['片长'] = movie_long
    # Synopsis, whitespace-trimmed.
    movie_simple = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_simple
    #print_movies(movies)
    movies_list.append(movies)
    # Rewrite the Excel file with everything collected so far.
    save_detials_movies(movies_list)
# Crawler entry point.
def start():
    """Walk all ten Top-250 list pages (25 movies per page, start=0..225)."""
    for offset in range(0, 250, 25):
        page_url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
        get_detial_urls(page_url)
if __name__ == '__main__':
    start()
可以抓取250条电影信息。