import re
from html import unescape
from time import time
from urllib.parse import urljoin

import requests
# Start page handed to parse() by main().
url = 'https://movie.douban.com/top250'
def fetch_page(url, timeout=10):
    """Download *url* with an HTTP GET and return the ``requests`` response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole script.

    Returns:
        The ``requests.Response`` object; the status code is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    return requests.get(url, timeout=timeout)
# Matches an "<a href=" followed -- after one whitespace character, which may
# be a line break -- by a <span class="title">; captures the span's text.
_TITLE_RE = re.compile(r'<a href=.*\s.*<span class="title">(.*)</span>')
# Captures relative pagination links of the form "?start=...".
_PAGE_LINK_RE = re.compile(r'<a href="(\?start=.*?)"')


def _page_text(response):
    """Return the body of *response* decoded to text."""
    # The regexes above are str patterns; applying them to the raw bytes in
    # response.content raises TypeError on Python 3.
    # NOTE(review): assumes the pages are UTF-8 encoded -- confirm.
    return response.content.decode('utf-8', errors='replace')


def _extract_page_urls(base_url, page):
    """Return the absolute pagination URLs linked from *page*, without
    duplicates, in the order their links first appear."""
    page_urls = []
    seen = set()
    for postfix in _PAGE_LINK_RE.findall(page):
        # href values are HTML-escaped (e.g. "&amp;"); unescape them before
        # resolving the link against the page it came from.
        page_url = urljoin(base_url, unescape(postfix))
        if page_url not in seen:
            seen.add(page_url)
            page_urls.append(page_url)
    return page_urls


def parse(url):
    """Print a numbered list of the titles found on *url* and on every
    pagination page it links to.

    The page at *url* is fetched first; each distinct "?start=..." link found
    in it is then fetched once, in the order the links appear, so the printed
    numbering is the same on every run.

    Args:
        url: Address of the first page of the list.

    Returns:
        None. The titles are written to standard output, one per line,
        prefixed with a 1-based index.
    """
    first_page = _page_text(fetch_page(url))
    titles = _TITLE_RE.findall(first_page)

    for page_url in _extract_page_urls(url, first_page):
        titles.extend(_TITLE_RE.findall(_page_text(fetch_page(page_url))))

    for i, title in enumerate(titles, 1):
        print('{} {}'.format(i, title))
def main():
    """Run parse() on the module-level ``url`` and print how many seconds
    the call took."""
    began = time()
    parse(url)
    elapsed = time() - began
    print('Cost {} seconds'.format(elapsed))
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
# requests爬取豆瓣top250 movie
# 最新推荐文章于 2024-04-23 11:36:40 发布