# Douban crawler (豆瓣爬虫)
import csv
import os
import re
from html import unescape
from re import findall

import requests
def _split_movie_info(line):
    """Split one "release time / country / genre" line into three fields.

    Always returns a 3-tuple of strings so callers never index past the end.
    """
    # NOTE(review): the page appears to join the fields with HTML-encoded
    # non-breaking spaces around the slash; decoding the entities first lets
    # ``\s`` match them as well as ordinary spaces.
    parts = re.split(r'\s*/\s*', unescape(line).strip())
    if len(parts) > 3:
        # Extra leading fields (presumably several release years) are folded
        # back into the first column; country and genre are the last two.
        parts = [' / '.join(parts[:-2]), parts[-2], parts[-1]]
    # Pad short or malformed lines with empty cells instead of raising.
    parts += [''] * (3 - len(parts))
    return tuple(parts)


def _parse_top250_rows(page):
    """Extract (name, score, votes, time, country, genre) rows from page HTML."""
    names = findall(r'<img width="100" alt="(.+?)"', page)
    scores = findall(r'<span class="rating_num" property="v:average">(.+?)</span>', page)
    votes = findall(r'<span>(\d+)人评价</span>', page)
    paragraphs = findall(r'(?s)<p class="">(.+?)</p>', page)
    # Only the last line of each paragraph carries the time/country/genre data.
    details = [_split_movie_info(p.strip().split('\n')[-1]) for p in paragraphs]
    # zip stops at the shortest list, matching the original multi-iterable map.
    return [(n, s, v, *d) for n, s, v, d in zip(names, scores, votes, details)]


def get_one_page(start=0):
    """Download one Top 250 listing page and append its movies to the CSV.

    Args:
        start: Offset of the first movie on the page (0, 25, 50, ...).

    Rows are written through the module-level csv writer ``w2``, which must
    exist before this function is called.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status (instead of silently writing nothing).
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    w2.writerows(_parse_top250_rows(response.text))
    print('-------------------------------一页获取完成-----------------------')
def get_one_page2():
    """Fetch the first Top 250 page and print all movies found on it.

    A single regular expression captures four groups per movie: the poster
    ``alt`` text, the raw contents of the info paragraph, the rating, and
    the number of ratings. The list of 4-tuples is printed, not returned.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
    """
    url = 'https://movie.douban.com/top250?start=0&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    # Bound the wait and surface HTTP errors rather than parsing an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = response.text
    # (?s) lets ".+?" span the newlines between the four captured pieces.
    result = findall(r'(?s)<img width="100" alt="(.+?)".+?<p class="">(.+?)</p>.+?<span class="rating_num" property="v:average">(.+?)</span>.+?<span>(\d+)人评价</span>', html)
    print(result)
from datetime import datetime
from csv import writer
if __name__ == '__main__':
    # Crawl all ten Top 250 pages into files/top250.csv.
    out_path = 'files/top250.csv'
    # Create the output directory up front so open() cannot fail on a missing folder.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # The with-block closes the file even if a page download raises midway.
    with open(out_path, 'w', encoding='utf-8', newline='') as f:
        # w2 stays a module-level name because get_one_page writes through it.
        w2 = writer(f)
        w2.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型'])
        # Offsets 0, 25, ..., 225 cover the ten pages of 25 movies each.
        for x in range(0, 226, 25):
            get_one_page(x)
import requests
from bs4 import BeautifulSoup
# Standalone variant: scrape the first Top 250 page with BeautifulSoup CSS
# selectors and print name, score and rating count for each movie.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get('https://movie.douban.com/top250', headers=headers, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
# One <div> per movie entry in the result list.
div_list = soup.select('.grid_view>li>div')
for x in div_list:
    # select_one returns the first match only.
    name = x.select_one('.title').text
    score = x.select_one('.rating_num').text
    # The last span under .star presumably reads "<count>人评价"; slicing off
    # the final three characters leaves just the count.
    comment = x.select('.star>span')[-1].text[:-3]
    print(name, score, comment)