# Datawhale web-crawler course, session 5, Day 2
# Regular expression tutorial: http://www.runoob.com/regexp/regexp-tutorial.html
import requests, re
# HTTP headers sent with every request; the User-Agent string identifies the
# client as desktop Chrome 72 on Windows 10.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}
def get_one_page(url, headers, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: connection errors, timeouts and malformed URLs
        # are reported as "no page". Unrelated exceptions (programming
        # errors, KeyboardInterrupt) are deliberately not caught here.
        return None
    if response.status_code != 200:
        return None
    return response.text
# Matches one film entry of the list page. re.S lets "." span newlines.
# The trailing quote part is optional and is not allowed to run past the
# current "</li>": with a mandatory quote, an entry lacking one would absorb
# the following entry's quote and that following entry would be lost.
_FILM_PATTERN = re.compile(
    '<li.*?item.*?pic.*?em.*?>(.*?)</em>'
    '.*?href="(.*?)".*?alt="(.*?)".*?src="(.*?)"'
    '.*?info.*?bd">.*?<p.*?>(.*?)</p>'
    '.*?star">.*?v:average">(.*?)</span>'
    '.*?span>(.*?)</span>'
    '(?:(?:(?!</li>).)*?inq">(.*?)</span>)?'
    '.*?</li>',
    re.S)

# Result keys, in the same order as the capture groups of _FILM_PATTERN.
_FILM_FIELDS = (
    'top',
    'film_url',
    'film_name',
    'film_img',
    'film_info',
    'film_star',
    'film_num',
    'film_quote',
)


def film_info(html):
    """Yield one dict per film entry found in *html*.

    Args:
        html: Markup of a list page, or ``None``/empty when the download
            failed.

    Yields:
        Dicts with the keys ``top``, ``film_url``, ``film_name``,
        ``film_img``, ``film_info``, ``film_star``, ``film_num`` and
        ``film_quote``. Values are the raw captured substrings;
        ``film_quote`` is ``''`` for an entry that has no quote.
    """
    if not html:
        # Nothing to parse (e.g. the fetch returned None): yield no films
        # instead of raising TypeError inside the regex engine.
        return
    for groups in _FILM_PATTERN.findall(html):
        # findall reports a non-participating optional group as ''.
        yield dict(zip(_FILM_FIELDS, groups))
def main(start):
    """Fetch one page of the Douban Top 250 list and print every film on it.

    Args:
        start: Value substituted into the ``start`` query parameter of the
            list URL (the script entry point passes multiples of 25).
    """
    import sys

    url = 'https://movie.douban.com/top250?start={}'.format(start)
    html = get_one_page(url, headers)
    if html is None:
        # get_one_page returns None when the download fails or the status
        # is not 200; skip this page rather than feeding None to the parser.
        print('Failed to fetch {}'.format(url), file=sys.stderr)
        return
    for info in film_info(html):
        print(info)
if __name__ == '__main__':
    # The list has 250 entries at 25 per page, so the offsets 0, 25, ..., 225
    # cover all ten pages; an offset of 250 would request a page past the end.
    for start in range(0, 250, 25):
        main(start)