分别使用 BeautifulSoup、XPath 和正则表达式提取猫眼电影 Top100 的信息。程序很简单,就不多做解释了,直接上代码吧。
# coding:utf-8
import requests
import re
from lxml import html
from bs4 import BeautifulSoup
# Base URL of the board page to scrape; the driver loop at the bottom of the
# file requests it repeatedly with a different ``offset`` query parameter.
url = 'http://maoyan.com/board/4?'
def getResponse(url, par=None, timeout=10):
    """Fetch *url* and return the response with its encoding set to UTF-8.

    Args:
        url: Address to request.
        par: Optional mapping of query-string parameters; forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The ``requests`` response object, with ``encoding`` forced to
        ``'utf-8'`` so that ``response.text`` decodes accordingly.

    Raises:
        SystemExit: With the message ``'url 解析失败'`` when the request
            fails or the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, params=par, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures become an exit. A bare ``except`` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        raise SystemExit('url 解析失败')
    response.encoding = 'utf-8'
    return response
def bs4_info(response):
    """Extract the movie fields from one board page using BeautifulSoup.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists, each in document
        order. Every list is gathered by its own ``find_all`` call.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    def paragraphs(css_class):
        # A plain string given as ``attrs`` is matched against the CSS class.
        return soup.find_all(name='p', attrs=css_class)

    names = [tag.a.string for tag in paragraphs('name')]
    stars = [tag.string.strip() for tag in paragraphs('star')]
    times = [tag.string for tag in paragraphs('releasetime')]

    # The score is split over the first two child nodes of the <p> element;
    # join their strings to rebuild the full value.
    score_children = [tag.contents for tag in paragraphs('score')]
    scores = []
    for children in score_children:
        scores.append(children[0].string + children[1].string)

    return names, stars, times, scores
def lxml_info(response):
    """Extract the movie fields from one board page using lxml and XPath.

    Args:
        response: Object whose ``content`` attribute holds the raw page bytes.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists, each in document
        order. Every list is gathered by its own XPath query.
    """
    # Note from the original author: without the explicit decode the Chinese
    # text is displayed incorrectly.
    tree = html.fromstring(response.content.decode('utf-8'))

    def select(expression):
        return tree.xpath(expression)

    # ``text()`` needs its parentheses to select the text nodes.
    names = select("//p[@class='name']/a/text()")
    stars = [text.strip() for text in select("//p[@class='star']/text()")]
    times = list(select("//p[@class='releasetime']/text()"))

    # Integer and fractional parts live in separate <i> elements; pair them
    # up positionally and concatenate.
    integer_parts = select("//i[@class='integer']/text()")
    fraction_parts = select("//i[@class='fraction']/text()")
    scores = [
        whole + fraction
        for whole, fraction in zip(integer_parts, fraction_parts)
    ]
    return names, stars, times, scores
# Compiled once at import time rather than on every call.
# The labels accept both the ASCII and the full-width colon: the pasted source
# only shows ':', but the page itself may use ':' -- TODO confirm against the
# live markup.
_NAME_PATTERN = re.compile(
    r'<p class="name"><a href=".*?title="(.*?)" data-act')
# Capture a date-shaped prefix (YYYY, YYYY-MM or YYYY-MM-DD) instead of a fixed
# ten characters, so shorter release dates still match and never drag the
# trailing region text or markup into the result.
_TIME_PATTERN = re.compile(
    r'<p class="releasetime">上映时间[::](\d{4}(?:-\d{1,2}){0,2}).*?</p>')
_STAR_PATTERN = re.compile(r'<p class="star">.*?主演[::](.*?)</p>', re.S)
_SCORE_PATTERN = re.compile(
    r'<p class="score"><i class="integer">(.*?)</i>'
    r'<i class="fraction">(\d)</i></p>')


def re_info(response):
    """Extract the movie fields from one board page using regular expressions.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A ``(names, stars, times, scores)`` tuple of lists of strings:
        the ``title`` attribute of each name link, the stripped text after
        the ``主演`` label, the date part after the ``上映时间`` label, and the
        integer and fraction parts of the score joined together.

        The four lists are collected independently, so they line up by index
        only when every movie on the page provides all four fields.
    """
    text = response.text
    names = _NAME_PATTERN.findall(text)
    times = _TIME_PATTERN.findall(text)
    stars = [cast.strip() for cast in _STAR_PATTERN.findall(text)]
    scores = [whole + fraction
              for whole, fraction in _SCORE_PATTERN.findall(text)]
    return names, stars, times, scores
# Accumulators for the whole chart, filled one page at a time.
names = []
stars = []
times = []
scores = []

# Ten pages are requested, with the ``offset`` query parameter stepping by
# ten from 0 to 90.
for page_index in range(10):
    response = getResponse(url, par={'offset': str(10 * page_index)})
    page_names, page_stars, page_times, page_scores = re_info(response)
    names.extend(page_names)
    stars.extend(page_stars)
    times.extend(page_times)
    scores.extend(page_scores)