导入需要的库
import requests
from lxml import etree
import re
# Request headers: a desktop Chrome User-Agent plus a Douban referer so the
# site serves the normal page instead of rejecting the scraper.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
    'Referer': 'https://movie.douban.com/top250?start=0&filter=',
}
获取每个页面电影的网址
每一页有很多电影,这部分代码是获取每一页中电影的网址,返回值是列表,包含每个电影的网址
def getMovieUrls(baseUrl):
    """Fetch one Top-250 list page and return the movie-detail URLs on it.

    Args:
        baseUrl: URL of a single list page (e.g. ...?start=0&filter=).

    Returns:
        List of movie-detail URLs; an empty list when the request fails or
        no URLs could be extracted, so callers can iterate unconditionally.
    """
    try:
        response = requests.get(baseUrl, headers=header, timeout=10)
        html = etree.HTML(response.text)
    except requests.exceptions.RequestException as e:
        # Original fell through here and hit a NameError on the undefined
        # `html`; return an empty list instead so the caller keeps running.
        print(e)
        return []
    # Each entry in the ordered list links to its detail page.
    movies_urls = html.xpath('//ol[@class="grid_view"]//a/@href')
    if not movies_urls:
        print('获取电影url失败')
    return movies_urls
获取电影的详细信息
这部分代码是获取每个电影的详细信息,我这里只是获取了电影的名字,排名,时长。其实movies_infos中已经包含了所有的信息,可以根据自己需要全部拿出来
def getDetail(url):
    """Fetch a movie's detail page and extract a few fields.

    Only the title, ranking, runtime and language are pulled out here; the
    raw '//div[@id="info"]//text()' token list contains all detail fields
    and can be mined for more.

    Args:
        url: URL of a movie detail page.

    Returns:
        dict with keys 'movie_name', 'movie_sorted' and, when present on
        the page, '片长' (runtime) and '语言' (language); False when the
        page could not be fetched.
    """
    try:
        response = requests.get(url, headers=header, timeout=10)
        html = etree.HTML(response.text)
    except requests.exceptions.RequestException as e:
        # Original printed the error but then used the undefined `html`,
        # raising NameError; bail out explicitly instead.
        print(e)
        return False
    movie_name = html.xpath('//div[@id="content"]//h1/span/text()')
    movie_sorted = html.xpath('//div[@id="content"]/div[@class ="top250"]/span/text()')
    movies_infos = html.xpath('//div[@id="info"]//text()')
    # Delete '/' and ':' (the original comment wrongly said "replace with
    # space") so labels like '片长:' become bare tokens we can match exactly.
    p = re.compile(r'[/:]')
    movies_infos = [re.sub(p, '', info).strip() for info in movies_infos]
    movies_infos = [m for m in movies_infos if m != '']
    movie = {}  # collected movie fields
    movie['movie_name'] = movie_name
    movie['movie_sorted'] = movie_sorted
    # In the flattened text a label token is immediately followed by its
    # value token; guard the index so a trailing label cannot IndexError.
    for index, info in enumerate(movies_infos):
        if index + 1 >= len(movies_infos):
            break
        if info == '片长':
            movie['片长'] = movies_infos[index + 1]
        elif info == '语言':
            movie['语言'] = movies_infos[index + 1]
    return movie
爬虫
这部分代码首先创建每个页码的网址,每一页只需改变start={}即可。然后调用上边的两个函数,最后返回电影的详细信息并打印。
def spiderUrls():
    """Crawl every Douban Top-250 list page and print each movie's details.

    Builds the ten page URLs (start = 0, 25, ..., 225), collects the movie
    links from each page, fetches every movie's detail dict and prints it
    with a running counter.
    """
    baseUrl = 'https://movie.douban.com/top250?start={}&filter='
    num = 1
    # 250 movies / 25 per page = 10 pages; the original range(0, 251, 25)
    # also requested a nonexistent eleventh page (start=250).
    for start in range(0, 250, 25):
        # Bug fix: str.format returns a NEW string. The original discarded
        # the result and requested the literal '{}' URL every time.
        pageUrl = baseUrl.format(start)
        for movieUrl in getMovieUrls(pageUrl):
            movie = getDetail(movieUrl)
            if movie:
                print('第{}电影信息'.format(num))
                # Only print details on success; the original also printed
                # the bare `False` after a failed fetch.
                print(movie)
            else:
                print("获取电影失败")
            num += 1
## 主函数
# Script entry point: crawl the Douban Top 250 only when run directly,
# not when this module is imported.
if __name__ == '__main__':
    spiderUrls()
初学入门,请各位大神赐教