requests爬取豆瓣top250 movie

import requests
from time import time
import re

url = 'https://movie.douban.com/top250'

def fetch_page(url):
    response = requests.get(url)
    return response

def parse(url):
    response = fetch_page(url)
    page = response.content
    #print(page)

    fetch_list = set()
    result = []

    for title in re.findall(r'<a href=.*\s.*<span class="title">(.*)</span>', page):
        result.append(title)

    for postfix in re.findall(r'<a href="(\?start=.*?)"', page):
        fetch_list.add(url + postfix.decode())

    for url in fetch_list:
        response = fetch_page(url)
        page = response.content
        for title in re.findall(r'<a href=.*\s.*<span class="title">(.*)</span>', page):
            result.append(title)

    for i, title in enumerate(result, 1):
        #title = title.decode('utf-8').encode('utf-8')
        print('{} {}'.format(i,title))

def main():
    start=time()
    parse(url)
    end=time()
    print('Cost {} seconds'.format((end - start)))

if __name__ == '__main__':
    main()

没有更多推荐了,返回首页

私密
私密原因:
请选择设置私密原因
  • 广告
  • 抄袭
  • 版权
  • 政治
  • 色情
  • 无意义
  • 其他
其他原因:
120
出错啦
系统繁忙,请稍后再试

关闭