写的第一个还算有点复杂的Python的程序,有点意思,感觉Python的实用性和开发效率实在很优秀,O(∩_∩)O哈哈~
源代码在最后,有兴趣的可以试试跑一下。
爬虫目标地址:豆瓣电影 Top 250。
效果展示
技术点
简单的面向对象
主要是
class Movie:
    """One entry of the Douban Top-250 list.

    All attributes are kept as the raw strings extracted from the page.
    """

    def __init__(self, rank, name, other_name, directors, actors, year, country, kind, star, persons,
                 quote, img_url):
        self.rank = rank              # position in the Top-250 ranking
        self.name = name              # primary (Chinese) title
        self.other_name = other_name  # alternate titles joined together
        self.directors = directors
        self.actors = actors
        self.year = year
        self.country = country
        self.kind = kind              # genre(s)
        self.star = star              # rating score
        self.persons = persons        # number of people who rated
        self.quote = quote            # one-line blurb
        self.img_url = img_url        # poster image URL

    def _labeled_fields(self):
        """Yield (label, value) pairs in display/export order (poster URL excluded)."""
        labels = ("排名", "电影名", "别名", "导演", "演员", "年份", "国家", "类别", "评分", "评价人数", "评价")
        values = (self.rank, self.name, self.other_name, self.directors, self.actors, self.year,
                  self.country, self.kind, self.star, self.persons, self.quote)
        return zip(labels, values)

    def __str__(self) -> str:
        return "".join("%s: %s\n" % (label, value) for label, value in self._labeled_fields())

    def toAttrList(self) -> List:
        """Return the exportable fields as a list (one spreadsheet row)."""
        return [value for _, value in self._labeled_fields()]
通过url下载图片
def download_jpg(img_url, img_name=""):
    """Download the image at *img_url* and write it to the current directory.

    When *img_name* is empty, a file name is derived from the URL by
    stripping the scheme and flattening "/" into ".".
    """
    res = requests.get(img_url, headers=getHeader(), stream=True)
    if img_name:
        filename = img_name
    else:
        filename = img_url.split(":", 1)[1].replace("/", ".")
    # "w" = writable, "b" = byte stream
    with open(filename, "wb") as f:
        f.write(res.content)
写入Excel
# Create the workbook.
wb = openpyxl.Workbook()
# Add one sheet as the first tab of the workbook.
ws = wb.create_sheet(index=0, title='豆瓣电影Top250')
# Header row.
ws.append(["排名", "电影名", "别名", "导演", "演员", "年份", "国家", "类别", "评分", "评价人数", "评价"])
urls = []
for movie in movies:
    # Each list appended to the sheet becomes one spreadsheet row.
    ws.append(movie.toAttrList())
# Persist the workbook to disk.
wb.save("豆瓣电影Top250统计.xlsx")
使用os库
使用os获得操作系统的api,就像操作命令行一样。
# Create the output directory and switch into it, so that all later
# relative paths are resolved inside it.
os.mkdir("豆瓣电影Top250统计")
os.chdir("豆瓣电影Top250统计")
# Batch-download the poster images into a nested sub-directory.
os.mkdir("豆瓣电影Top250图片保存")
os.chdir("豆瓣电影Top250图片保存")
爬虫模块
主要运用 requests、bs4 库解析网页,提取我们想要的信息。
源代码
import random
import re
import os
from typing import List
import requests
from bs4 import BeautifulSoup
import openpyxl
# Pool of real-world User-Agent strings (desktop, mobile and legacy
# browsers).  One is picked at random per request (see getHeader) so the
# scraper's traffic looks less uniform and is less likely to be blocked.
user_agent = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
# iPhone 6:
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
def getHeader():
    """Build a request-header dict with a randomly chosen User-Agent.

    Rotating the User-Agent per request makes consecutive requests look
    like they come from different clients.
    """
    agent = random.choice(user_agent)
    return {'User-Agent': agent}
class Movie:
    """One entry of the Douban Top-250 list.

    All attributes are kept as the raw strings extracted from the page.
    """

    def __init__(self, rank, name, other_name, directors, actors, year, country, kind, star, persons,
                 quote, img_url):
        self.rank = rank              # position in the Top-250 ranking
        self.name = name              # primary (Chinese) title
        self.other_name = other_name  # alternate titles joined together
        self.directors = directors
        self.actors = actors
        self.year = year
        self.country = country
        self.kind = kind              # genre(s)
        self.star = star              # rating score
        self.persons = persons        # number of people who rated
        self.quote = quote            # one-line blurb
        self.img_url = img_url        # poster image URL

    def _labeled_fields(self):
        """Yield (label, value) pairs in display/export order (poster URL excluded)."""
        labels = ("排名", "电影名", "别名", "导演", "演员", "年份", "国家", "类别", "评分", "评价人数", "评价")
        values = (self.rank, self.name, self.other_name, self.directors, self.actors, self.year,
                  self.country, self.kind, self.star, self.persons, self.quote)
        return zip(labels, values)

    def __str__(self) -> str:
        return "".join("%s: %s\n" % (label, value) for label, value in self._labeled_fields())

    def toAttrList(self) -> List:
        """Return the exportable fields as a list (one spreadsheet row)."""
        return [value for _, value in self._labeled_fields()]
def download_jpg(img_url, img_name=""):
    """Download the image at *img_url* and write it to the current directory.

    Args:
        img_url: direct URL of the image.
        img_name: target file name; when empty, a name is derived from
            the URL by stripping the scheme and flattening "/" into ".".

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    res = requests.get(img_url, headers=getHeader(), stream=True)
    # Fail loudly on 4xx/5xx instead of silently saving an error page as a .jpg.
    res.raise_for_status()
    if img_name:
        filename = img_name
    else:
        filename = img_url.split(":", 1)[1].replace("/", ".")
    # "w" = writable, "b" = byte stream
    with open(filename, "wb") as f:
        f.write(res.content)
def craw() -> List:
    """Scrape the Douban movie Top-250 list.

    Walks the 10 list pages (25 entries each), parses every movie item and
    returns a list of Movie objects in ranking order.  Pages that answer
    with a non-200 status are skipped.

    Returns:
        List of Movie instances (up to 250).
    """
    top250Movies = []
    link = "https://movie.douban.com/top250"
    for pageNumber in range(10):
        # Paging works through the ?start= offset: 0, 25, 50, ...
        url = link + "?start=" + str(pageNumber * 25)
        # timeout raised from 1s: a single slow response should not abort a page.
        res = requests.get(url, headers=getHeader(), timeout=10)
        if res.status_code != 200:
            continue
        soup = BeautifulSoup(res.text, 'lxml')
        for item in soup.find_all("div", class_="item"):
            pic_tag = item.find("div", class_="pic")
            rank = pic_tag.em.text
            movie_img_url = pic_tag.a.img['src']
            info_tag = item.find("div", class_="info")
            hd_tag = info_tag.find("div", class_="hd")
            bd_tag = info_tag.find("div", class_="bd")
            # The <span class="title"/"other"> elements hold the main title
            # followed by "/"-separated alternate titles.
            titles = "".join(
                span.text for span in hd_tag.a.find_all("span", class_=["title", "other"])
            ).split("/")
            name = titles[0]
            other_name = "".join(titles[1:]) if len(titles) > 1 else ""
            # First <p> of bd: line 1 is "导演... 主演...", line 2 is
            # "year / country / genre"; pop(1) drops the <br> between them.
            descriptions = bd_tag.find("p").contents
            descriptions.pop(1)
            directors_and_actors = descriptions[0].strip()
            directors = ""
            actors = ""
            dir_match = re.search("导演:", directors_and_actors)
            act_match = re.search("主演:", directors_and_actors)
            if dir_match is not None and act_match is not None:
                directors = directors_and_actors[dir_match.end():act_match.start()].strip()
                actors = directors_and_actors[act_match.end():].strip()
            description = [part.strip() for part in descriptions[1].split("/")]
            year = description[0]
            country = description[1]
            kind = description[2]
            star_tag = bd_tag.find("div", class_="star")
            star = star_tag.find("span", class_="rating_num").text
            # Rating count is the last text node, e.g. "2867470人评价";
            # keep only the digits.
            persons_match = re.search(r"\d+", star_tag.contents[-2].text)
            persons = persons_match.group() if persons_match else ""
            # BUG FIX: the original skipped (continue) movies that have no
            # quote, so the result held fewer than 250 entries; keep them
            # with an empty quote instead.
            quote_tag = bd_tag.find("p", class_="quote")
            quote = quote_tag.text.strip() if quote_tag is not None else ""
            movie = Movie(rank=rank, name=name, other_name=other_name, directors=directors,
                          actors=actors, year=year, country=country, kind=kind, star=star,
                          persons=persons, quote=quote, img_url=movie_img_url)
            top250Movies.append(movie)
    return top250Movies
def save():
    """Crawl the Top-250 list, export it to Excel and download all posters.

    Side effects: creates the directory 豆瓣电影Top250统计 (with a nested
    豆瓣电影Top250图片保存), changes the process working directory into it,
    writes one .xlsx file and up to 250 .jpg files.
    """
    movies = craw()
    # Write one spreadsheet row per movie.
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(index=0, title='豆瓣电影Top250')
    ws.append(["排名", "电影名", "别名", "导演", "演员", "年份", "国家", "类别", "评分", "评价人数", "评价"])
    urls = []
    for movie in movies:
        ws.append(movie.toAttrList())
        urls.append((movie.img_url, movie.name + ".jpg"))
    # exist_ok=True makes the script re-runnable; the original os.mkdir
    # raised FileExistsError on a second run.
    os.makedirs("豆瓣电影Top250统计", exist_ok=True)
    os.chdir("豆瓣电影Top250统计")
    wb.save("豆瓣电影Top250统计.xlsx")
    # Batch-download the poster images into a nested sub-directory.
    os.makedirs("豆瓣电影Top250图片保存", exist_ok=True)
    os.chdir("豆瓣电影Top250图片保存")
    for img_url, img_name in urls:
        try:
            download_jpg(img_url, img_name)
        except Exception as exc:
            # Best-effort: one bad poster must not abort the remaining downloads.
            print("download failed for %s: %s" % (img_name, exc))
save()