"""
Scrape movie information from the Douban Movie Top 250 list.
Page URL: https://movie.douban.com/top250
Tech stack: requests - bs4 - re
Output: saved as a CSV file
"""
import re
import requests
from bs4 import BeautifulSoup
def getHTMLText(url, code="utf-8", timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding assigned to the response before its text is decoded.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response text, or the sentinel string "爬取出错!" when
        the request fails or the server replies with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx replies into an exception so they share the
        # failure path below instead of being parsed as a valid page.
        r.raise_for_status()
        r.encoding = code
        return r.text
    # Only request-level failures map to the sentinel; KeyboardInterrupt,
    # SystemExit and programming errors must still propagate.
    except requests.RequestException:
        return "爬取出错!"
def parseText(text, movieInfo):
soup = BeautifulSoup(text, 'html.parser')
olTag = soup.find('ol', class_='grid_view')
details = olTag.find_all('li')
for detail in details:
movieRank = detail.find('em').text #电影排名
movieName = '《' + detail.find('span', class_='title').text + '》' #电影名称
movieScore = detail.find('span', class_='rating_num').text + '分' #电影评分
movieCommentNum = detail.find(text=re.compile('\d+人评价')).string #评价人数
movieReview = '"' + detail.find('span', class_='inq').text + '"' #电影短评
movieP = detail.find('p').text
movieP1 = movieP.split('\n')[1]
movieP2 = movieP.split('\n')[2]
movieDirector = movieP1.split('\xa0')[0].strip()[4:] #导演
movieYear = re.findall(r'\d{4}', movieP2)[0] #上映年份
movieCountry = movieP2.split('\xa0/\xa0')[-2] #制片国家