# Copyright notice: this is an original article by the blog author and may not be reproduced without the author's permission. https://blog.csdn.net/kun1280437633/article/details/80491184
'''
Analysis:
1. Crawl flow
   URL: http://www.dingziys.com/movie/list.html?year=2018&cate=0&country=&rating=0&page=7&per-page=24
   Method: GET
   Query parameters:
       year: 2018
       cate: 0
       country: (empty)
       rating: 0
       page: 1
       per-page: 24
'''
import re
import requests
# When True, run() stops after the first listing page has been processed.
Debug = False


class DingzimovieSpider:
    """Print the title and detail-page URL of movies listed on dingziys.com.

    The spider requests the paginated movie listing, extracts every movie
    entry from the returned HTML with regular expressions, and prints one
    ``name url`` line per movie.
    """

    # One listing entry: everything inside a "project-detail" <div>.
    _DETAIL_DIV_PATTERN = re.compile(
        r'<div class="box-content-1 project-detail">(.*?)</div>', re.S)
    # Captures (href, title) of the detail link. ``[^"]*`` keeps each capture
    # inside its own attribute value; a greedy ``.*`` would run on to the last
    # double quote on the line when further attributes follow.
    _DETAIL_LINK_PATTERN = re.compile(
        r'<a href="([^"]*)" target="_blank" title="([^"]*)"')

    def __init__(self):
        # Listing endpoint that accepts the paging/filter query parameters.
        self.base_url = "http://www.dingziys.com/movie/list.html"
        # Site root, prepended to the relative hrefs found in the listing.
        self.base_url1 = "http://www.dingziys.com"

    def parse_list_page(self, html):
        """Extract the movies contained in one listing page.

        Args:
            html: Decoded HTML text of a listing page.

        Returns:
            A list of ``(name, url)`` tuples in page order, where ``url`` is
            ``self.base_url1`` followed by the link's href. Entries whose
            <div> contains no matching link are skipped.
        """
        movies = []
        for div_html in self._DETAIL_DIV_PATTERN.findall(html):
            link = self._DETAIL_LINK_PATTERN.search(div_html)
            if link is None:
                # Entry without a recognisable detail link; nothing to report.
                continue
            href, name = link.groups()
            movies.append((name, self.base_url1 + href))
        return movies

    def run(self, start_page=0, page_count=5000, timeout=10):
        """Fetch listing pages one by one and print every movie found.

        Args:
            start_page: Value of the first ``page`` query parameter sent.
            page_count: Number of consecutive pages to request.
            timeout: Seconds to wait for each HTTP response before
                ``requests`` raises a timeout error.

        Raises:
            requests.RequestException: If a request fails or times out.
            UnicodeDecodeError: If a response body is not valid UTF-8.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        for page in range(start_page, start_page + page_count):
            params = {
                'year': '2018',
                'cate': '0',
                'country': '',
                'rating': '0',
                'page': page,
                # Must be spelled exactly "per-page" (no spaces) to match the
                # query parameter used by the listing URL.
                'per-page': 24,
            }
            response = requests.get(
                self.base_url, params=params, headers=headers, timeout=timeout)
            total_html = response.content.decode('utf-8')
            for name, detail_url in self.parse_list_page(total_html):
                print(name, detail_url)
            # NOTE(review): the original nesting of this check was lost with
            # the file's indentation; stopping after one page is assumed.
            if Debug:
                break


if __name__ == '__main__':
    spider = DingzimovieSpider()
    spider.run()