"""
爬取豆瓣电影Top250
"""
import os
import re
import time
import requests
from bs4 import BeautifulSoup
# Crawl one page of the Top250 list and follow the pagination.
# Browser-like headers: Douban answers the default python-requests
# User-Agent with an error page, so a real UA string is required.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Extracts the "year / area / genre" text that sits between the <br/> tag
# and the next tag inside the ".bd p" element of each list item.
_INFO_PATTERN = re.compile(r'(?<=<br/>).*?(?=<)', re.S | re.M)


def download(url, page):
    """Crawl the Douban Top250 list starting at *url* with offset *page*.

    Fetches each page (25 movies per page) until offset 250, extracting
    rank, title, rating, year, area and genre for every movie and handing
    each record to write_fo_file().

    Args:
        url: full URL of the first page to fetch.
        page: current start offset (0, 25, 50, ...).

    Raises:
        requests.HTTPError: if Douban returns a non-2xx status.
    """
    while True:
        resp = requests.get(url, headers=_HEADERS, timeout=10)
        resp.raise_for_status()  # fail loudly instead of parsing an error page
        soup = BeautifulSoup(resp.text, 'html.parser')
        for li in soup.select('ol li'):
            index = li.find('em').text
            title = li.find('span', class_='title').text
            rating = li.find('span', class_='rating_num').text
            match = _INFO_PATTERN.search(str(li.select_one('.bd p')))
            if match is None:
                continue  # unexpected markup for this entry; skip, don't crash
            infos = match.group().strip().split('/')
            if len(infos) < 3:
                continue  # malformed "year / area / genre" line
            year, area, genre = (part.strip() for part in infos[:3])
            write_fo_file(index, title, rating, year, area, genre)
        page += 25
        if page >= 250:
            break
        time.sleep(2)  # be polite to the server between pages
        url = 'https://movie.douban.com/top250?start={}&filter='.format(page)
def write_fo_file(index, title, rating, year, area, genre):
    """Append one movie record as a CSV line to movie_top250.csv.

    Fixes vs. the original: the actual values are written (not the literal
    strings "title1"/"rating"), the file is opened in append mode so earlier
    records are kept (mode 'w' overwrote the file on every call), the file is
    reliably closed via `with` (`f.closed` was a no-op attribute access), and
    the filename matches the one main() clears before a run.

    Args:
        index: ranking number, e.g. "1".
        title: movie title.
        rating: rating score string, e.g. "9.7".
        year: release year.
        area: country/region of production.
        genre: genre description.
    """
    print(title, rating)  # console progress feedback
    with open('movie_top250.csv', 'a', encoding='utf-8') as f:
        f.write(f'{index},{title},{rating},{year},{area},{genre}\n')
def main():
    """Entry point: start from a clean output file, then crawl all pages."""
    output_file = 'movie_top250.csv'
    # Remove any leftover output from a previous run before crawling.
    if os.path.exists(output_file):
        os.remove(output_file)
    start_url = 'https://movie.douban.com/top250'
    download(start_url, 0)
    print("爬取完毕。")


if __name__ == '__main__':
    main()
# NOTE(review): residual blog-page footer from the copy-paste source,
# commented out so the file remains valid Python:
# 豆瓣top250爬取
# 最新推荐文章于 2024-05-13 15:07:13 发布