Source code
"""
Version 1.1.0
Author lkk
Email lkk199404@163.com
date 2018-11-25 18:39
DESC 电影天堂
"""
# https://www.dy2018.com/
from urllib import request
import time
from lxml import html
from fake_useragent import UserAgent


def target_data(url):
    """Fetch a page with a random User-Agent and return the parsed lxml document."""
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
    }
    start_url = request.Request(url, headers=headers)
    response = request.urlopen(start_url)
    data = response.read()
    # The site serves gb2312-encoded pages; skip undecodable bytes.
    encoding = 'gb2312'
    data_info = data.decode(encoding, 'ignore')
    docs = html.fromstring(data_info)
    return docs


def core(url):
    """Collect the detail-page links from one listing page."""
    docs = target_data(url)
    links = docs.xpath("//tr[2]/td[2]/b/a[@class='ulink'][2]/@href")
    return links


def get_data(url):
    """Scrape every movie detail page linked from the listing page at url."""
    list_urls = core(url)
    base_url = 'https://www.dy2018.com'
    for i in list_urls:
        last_url = base_url + i
        docs = target_data(last_url)
        name = docs.xpath("//div[@class='co_area2']/div[@class='title_all']/h1/text()")
        score = docs.xpath("//div[@class='position']/span[1]")
        date = docs.xpath("//div[@class='position']/span[@class='updatetime']/text()")
        # The positional p[...] indices depend on the detail-page layout
        # and may shift from page to page.
        play_date = docs.xpath("//div[@id='Zoom']/p[9]/text()")
        classify = docs.xpath("//div[@id='Zoom']/p[6]/text()")
        abstract = docs.xpath("//div[@id='Zoom']/p[31]/text()")
        download_links = docs.xpath("//table[1]//tr/td/anchor/a/@href")
        score_text = score[0].xpath("string(.)") if score else ''
        print(name, score_text, date, play_date, classify, abstract,
              download_links, last_url)
        # TODO: persist the scraped data (see the sketch at the end).


if __name__ == '__main__':
    urls = [
        'https://www.dy2018.com/3/', 'https://www.dy2018.com/2/',
        'https://www.dy2018.com/0/', 'https://www.dy2018.com/1/',
        'https://www.dy2018.com/4/', 'https://www.dy2018.com/8/',
        'https://www.dy2018.com/5/', 'https://www.dy2018.com/7/',
        'https://www.dy2018.com/15/', 'https://www.dy2018.com/14/',
        'https://www.dy2018.com/html/tv/hytv/index.html',
        'https://www.dy2018.com/html/tv/oumeitv/index.html',
        'https://www.dy2018.com/html/tv/rihantv/index.html',
        'https://www.dy2018.com/html/zongyi2013/index.html',
        'https://www.dy2018.com/html/dongman/index.html',
    ]
    base_url = 'https://www.dy2018.com'
    for url in urls:
        print(url)
        docs = target_data(url)
        get_data(url)
        # Follow the "下一页" (next page) link until the listing runs out.
        while True:
            try:
                next_pages = docs.xpath("//div[@class='x']/p/a[text()='下一页']/@href")
                if not next_pages:
                    break
                next_url = base_url + next_pages[0]
                print(next_url)
                time.sleep(5)  # throttle requests between pages
                get_data(next_url)
                docs = target_data(next_url)
            except Exception as e:
                print(e)
                break
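
The TODO above is still open. As one possible direction, here is a minimal persistence sketch built on the standard-library sqlite3 module; the database file name, the movies table, and its column layout are illustrative assumptions rather than part of the original script:

import sqlite3

def save_movie(row):
    """Append one scraped record to a local SQLite file (hypothetical schema)."""
    conn = sqlite3.connect('dy2018.db')
    conn.execute("""
        CREATE TABLE IF NOT EXISTS movies (
            name TEXT, score TEXT, date TEXT, play_date TEXT,
            classify TEXT, abstract TEXT, links TEXT, url TEXT
        )
    """)
    conn.execute(
        "INSERT INTO movies VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
        row,
    )
    conn.commit()
    conn.close()

Inside get_data(), the print(...) call could then be followed by save_movie((str(name), score_text, str(date), str(play_date), str(classify), str(abstract), ','.join(download_links), last_url)), flattening each xpath result into a plain string before storage.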