最近突然对python兴趣浓厚,在看过几本基础书籍后,便想动手开始做做小的实践,第一个尝试的便是爬虫,在学习了很多大神的爬虫入门贴后,深感佩服,在这里感谢以下两位博主的精彩分享:
1.xlzd
2.Jecvay
其中,由于目前好多网站都采取了反爬行动,所以在学习的时候并不能完全采用上述两位博主的源码,但是参考价值很大。本文是在xlzd的基础篇的例子上稍微加了点爬取内容,即可以下载Top250的影片图片和对Top250评分进行提取。
代码如下:
# coding=utf-8
'''
爬取豆瓣电影Top250
'''
import requests
from bs4 import BeautifulSoup
import codecs # 字符转换
import re
from contextlib import closing
import os
# Entry URL of the Douban Movie Top 250 list (first page); relative
# "next page" links from parse_html() are appended to this base.
DownLoad_url = 'https://movie.douban.com/top250'
def download_page(url):
    """Fetch *url* and return the response body as text.

    A browser-like User-Agent header is sent because douban.com rejects
    requests that identify themselves as scripts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36'}
    # timeout= keeps the scraper from hanging forever on a stalled connection;
    # the original call had no timeout at all.
    data = requests.get(url, headers=headers, timeout=30).text
    return data
def parse_html(html):
    """Parse one Top250 listing page.

    Returns a tuple ``(movie_name_list, next_url)``:
    *movie_name_list* holds one "title + rating" string per film on the page;
    *next_url* is the absolute URL of the next page, or ``None`` on the
    last page.
    """
    soup = BeautifulSoup(html, "html.parser")
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})

    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        score = movie_li.find('div', attrs={'class': 'bd'})
        movie_score = score.find('span', attrs={'class': 'rating_num'}).getText()
        movie_name_list.append(movie_name + movie_score)

    # "Next page" arrow: the span exists but contains no <a> on the last
    # page. Guard the span lookup too, so a markup change yields None
    # instead of an AttributeError.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span else None
    if next_page:
        return movie_name_list, DownLoad_url + next_page['href']
    return movie_name_list, None
def getPic(data):
    """Extract poster-image URLs from raw HTML *data*.

    Returns the raw regex matches (each still wrapped as ``src="...jpg"``);
    callers strip the wrapper with ``match[5:-1]``.
    """
    # \. — the original pattern used an unescaped dot, which matches any
    # character (e.g. "...Xjpg"); escape it so only real .jpg URLs match.
    pic_list = re.findall(r'src="http.+?\.jpg"', data)
    return pic_list
def download_pic(url, name, save_dir='F:\\PythonCode\\SavePath\\doubanTop250\\'):
    """Stream the image at *url* into ``save_dir`` as ``<name>.<ext>``.

    The extension is recovered from the last dot-separated piece of *url*.
    *save_dir* defaults to the original hard-coded Windows path but can now
    be overridden by callers.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    pic_type = '.' + url.split('.')[-1]
    # closing() guarantees the streamed connection is released. The original
    # code issued a second identical requests.get() here and leaked the
    # response of the first one, doubling traffic for every image.
    with closing(requests.get(url, stream=True, timeout=30)) as response:
        with open(save_dir + name + pic_type, 'wb') as file:
            for data in response.iter_content(128):
                file.write(data)
def main():
    """Debug helper: dump the raw HTML of the first Top250 page to stdout."""
    page_html = download_page(DownLoad_url)
    print(page_html)
if __name__ == '__main__':
    # Crawl every page of the Top 250: write "title + rating" lines to the
    # file 'movies' and download each film's poster image.
    url = DownLoad_url
    n = 1  # running Top-N rank across all pages
    with codecs.open('movies', 'w', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            picdata = getPic(html)
            movies, url = parse_html(html)
            # zip() pairs each poster with its movie entry; the original
            # indexed movies[] with a manual counter and would raise
            # IndexError if the two lists ever differed in length.
            for picinfo, movie in zip(picdata, movies):
                # picinfo looks like 'src="...jpg"'; slice off the wrapper.
                download_pic(picinfo[5:-1], 'Top' + str(n) + '-' + movie)
                print(movie + '下载完成!')
                n = n + 1
            # Trailing '\n' so consecutive pages do not run together on one
            # line (the original wrote pages back-to-back with no separator).
            fp.write(u'\n'.join(movies) + u'\n')