三种方法抓取猫眼电影top100信息

最新推荐文章于 2024-04-08 10:44:28 发布

ilove_itachi

最新推荐文章于 2024-04-08 10:44:28 发布

阅读量3.2k

点赞数

分类专栏： tensorflow入门 Python基础 文章标签：正则表达式 xpath html

本文链接：https://blog.csdn.net/ilove_itachi/article/details/76064070

版权

Python基础 同时被 2 个专栏收录

13 篇文章 0 订阅

订阅专栏

tensorflow入门

4 篇文章 0 订阅

订阅专栏

分别使用 BeautifulSoup、XPath 和正则表达式三种方法提取猫眼电影 Top100 的信息。程序很简单，就不逐行解释了，直接上程序吧。

# coding:utf-8
import requests
import re
from lxml import html
from bs4 import BeautifulSoup

# Base address of the Maoyan board page that is scraped; the page offset is
# supplied separately as a query parameter when the request is made.
url = 'http://maoyan.com/board/4?' 

def getResponse(url, par=None, timeout=10):
    """Fetch *url* with a GET request and return the response.

    Args:
        url: Address to request.
        par: Optional mapping of query-string parameters; forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``requests`` response object with its ``encoding`` set to
        ``'utf-8'``.

    Raises:
        SystemExit: If the request fails or the server answers with an
            HTTP error status.
    """
    try:
        response = requests.get(url, params=par, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures end the program; KeyboardInterrupt and
        # programming errors are no longer swallowed by a bare ``except``.
        raise SystemExit('url 解析失败')
    response.encoding = 'utf-8'
    return response

def bs4_info(response):
    """Extract the movie fields from one page using BeautifulSoup.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A tuple ``(names, stars, times, scores)`` of lists, one entry per
        matching ``<p>`` element found in the page.
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    def paragraphs(css_class):
        # Per the BeautifulSoup docs, a plain string given as ``attrs`` is
        # matched against the element's CSS class.
        return soup.find_all(name='p', attrs=css_class)

    names = [tag.a.string for tag in paragraphs('name')]
    stars = [tag.string.strip() for tag in paragraphs('star')]
    times = [tag.string for tag in paragraphs('releasetime')]

    # Each score paragraph has two children whose strings are concatenated.
    scores = []
    for tag in paragraphs('score'):
        parts = tag.contents
        scores.append(parts[0].string + parts[1].string)

    return names, stars, times, scores

def lxml_info(response):
    """Extract the movie fields from one page using lxml XPath queries.

    Args:
        response: Object whose ``content`` attribute holds the raw page bytes.

    Returns:
        A tuple ``(names, stars, times, scores)`` of lists.
    """
    # Decode the bytes as UTF-8 explicitly; without the decode, the Chinese
    # text is displayed incorrectly.
    tree = html.fromstring(response.content.decode('utf-8'))

    # Note that ``text()`` in the XPath expression needs its parentheses.
    names = tree.xpath("//p[@class='name']/a/text()")
    stars = [raw.strip() for raw in tree.xpath("//p[@class='star']/text()")]
    times = list(tree.xpath("//p[@class='releasetime']/text()"))

    # The score is split across two <i> elements; join them pairwise.
    integers = tree.xpath("//i[@class='integer']/text()")
    fractions = tree.xpath("//i[@class='fraction']/text()")
    scores = [whole + frac for whole, frac in zip(integers, fractions)]

    return names, stars, times, scores

# Patterns are compiled once and reused for every page. Parentheses mark the
# part of each match that is extracted.
_NAME_RE = re.compile(r'<p class="name"><a href=".*?title="(.*?)" data-act')
# A release date may be a bare year, a year-month or a full date, so match the
# date's shape instead of assuming it is always exactly 10 characters long.
_TIME_RE = re.compile(r'<p class="releasetime">上映时间：(\d{4}(?:-\d{2}){0,2}).*?</p>')
# re.S lets ``.`` cross the line breaks inside the star paragraph.
_STAR_RE = re.compile(r'<p class="star">.*?主演：(.*?)</p>', re.S)
_SCORE_RE = re.compile(r'<p class="score"><i class="integer">(.*?)</i><i class="fraction">(\d)</i></p>')


def re_info(response):
    """Extract the movie fields from one page using regular expressions.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A tuple ``(names, stars, times, scores)`` of lists of strings:
        the ``title`` attribute of each name link, the stripped text after
        ``主演：``, the release date (``YYYY``, ``YYYY-MM`` or
        ``YYYY-MM-DD``) and the integer and fraction score parts joined.
    """
    text = response.text

    names = _NAME_RE.findall(text)
    times = _TIME_RE.findall(text)
    stars = [raw.strip() for raw in _STAR_RE.findall(text)]
    scores = [whole + frac for whole, frac in _SCORE_RE.findall(text)]

    return names, stars, times, scores

# Result lists accumulated across all requested pages.
names, stars, times, scores = [], [], [], []

# Request ten pages, passing offsets 0, 10, ..., 90 as the ``offset`` query
# parameter, and collect the fields parsed from each page.
for offset in range(0, 100, 10):
    response = getResponse(url, par={'offset': str(offset)})
    page_names, page_stars, page_times, page_scores = re_info(response)
    names.extend(page_names)
    stars.extend(page_stars)
    times.extend(page_times)
    scores.extend(page_scores)