requests爬取豆瓣top250 movie

import requests
from time import time
import re

url = 'https://movie.douban.com/top250'

def fetch_page(url):
    response = requests.get(url)
    return response

def parse(url):
    response = fetch_page(url)
    page = response.content
    #print(page)

    fetch_list = set()
    result = []

    for title in re.findall(r'<a href=.*\s.*<span class="title">(.*)</span>', page):
        result.append(title)

    for postfix in re.findall(r'<a href="(\?start=.*?)"', page):
        fetch_list.add(url + postfix.decode())

    for url in fetch_list:
        response = fetch_page(url)
        page = response.content
        for title in re.findall(r'<a href=.*\s.*<span class="title">(.*)</span>', page):
            result.append(title)

    for i, title in enumerate(result, 1):
        #title = title.decode('utf-8').encode('utf-8')
        print('{} {}'.format(i,title))

def main():
    start=time()
    parse(url)
    end=time()
    print('Cost {} seconds'.format((end - start)))

if __name__ == '__main__':
    main()

阅读更多
想对作者说点什么?
相关热词

博主推荐

换一批

没有更多推荐了,返回首页