爬取豆瓣网正在上映电影信息(HTMLParser实现)

from urllib import request
from html.parser import HTMLParser
import json
class MovieParser(HTMLParser):
    """Collect now-playing movie records from ``<li>`` start tags.

    A ``<li>`` tag is recorded when its ``data-category`` attribute equals
    ``'nowplaying'`` and its ``data-title`` attribute is present and
    non-empty. Each record is a dict with the keys ``title``, ``score``,
    ``director`` and ``actors``; it is appended to ``self.movies`` and a
    one-line summary is printed.
    """

    def __init__(self):
        super().__init__()
        # Movie dicts in the order their <li> tags were encountered.
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record the movie described by a matching ``<li>`` start tag.

        ``attrs`` is the list of ``(name, value)`` pairs handed over by
        ``HTMLParser``. An attribute that is absent is stored as ``None``.
        """
        if tag != 'li':
            return

        # When an attribute name is repeated, the first value wins.
        first_seen = {}
        for name, value in attrs:
            first_seen.setdefault(name, value)

        if not first_seen.get('data-title'):
            return
        if first_seen.get('data-category') != 'nowplaying':
            return

        movie = {
            'title': first_seen.get('data-title'),
            'score': first_seen.get('data-score'),
            'director': first_seen.get('data-director'),
            'actors': first_seen.get('data-actors'),
        }
        self.movies.append(movie)
        print('%(title)s| %(score)s| %(director)s| %(actors)s' % movie)



def nowplaying(url):
    """Download a page and return the now-playing movies found in it.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``movies`` list built by ``MovieParser`` while parsing the page:
        one dict per matching ``<li>`` tag with the keys ``title``,
        ``score``, ``director`` and ``actors``.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Errors raised by ``urllib.request.urlopen`` (e.g. network or HTTP
        failures) are propagated unchanged.
    """
    req = request.Request(url)
    # Send a desktop-browser User-Agent string with the request.
    req.add_header('User-Agent',
                  'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    # Close the HTTP response deterministically instead of leaving the
    # underlying connection open until garbage collection.
    with request.urlopen(req) as response:
        body = response.read()
    parser = MovieParser()
    parser.feed(body.decode('utf-8'))
    # Signal end of input so the parser processes anything still buffered.
    parser.close()
    return parser.movies


if __name__ == "__main__":
    # Scrape the Wuhan "now playing" listing and pretty-print it as JSON
    # (keys sorted, four-space indentation).
    url = "https://movie.douban.com/nowplaying/wuhan/"
    movies = nowplaying(url)

    report = json.dumps(movies, indent=4, sort_keys=True, separators=(',', ': '))
    print(report)
 
 
 
如果对 html.parser 不熟悉,可以先看下面摘自官方文档的说明(仅节选了一小部分):
 
 
HTMLParser.handle_starttag(tag, attrs): This method is called to handle the start of a tag (e.g. <div id="main">). The tag argument is the name of the tag converted to lower case. The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets. The name will be translated to lower case, and quotes in the value have been removed, and character and entity references have been replaced. For instance, for the tag <A HREF="https://www.cwi.nl/">, this method would be called as handle_starttag('a', [('href', 'https://www.cwi.nl/')]). All entity references from html.entities are replaced in the attribute values.

As a basic example, below is a simple HTML parser that uses the HTMLParser class to print out start tags, end tags, and data as they are encountered:

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Print each start tag, end tag and text run as it is encountered."""

    def _announce(self, label, payload):
        # Two-argument print: label and payload separated by a single space.
        print(label, payload)

    def handle_starttag(self, tag, attrs):
        """Report an opening tag; its attributes are ignored."""
        self._announce("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        self._announce("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Report a run of text found between tags."""
        self._announce("Encountered some data  :", data)

# Feed a small document; each callback prints one line as parsing proceeds.
parser = MyHTMLParser()
sample_markup = (
    '<html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'
)
parser.feed(sample_markup)

The output will then be:

Encountered a start tag: html
Encountered a start tag: head
Encountered a start tag: title
Encountered some data  : Test
Encountered an end tag : title
Encountered an end tag : head
Encountered a start tag: body
Encountered a start tag: h1
Encountered some data  : Parse me!
Encountered an end tag : h1
Encountered an end tag : body
Encountered an end tag : html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值