import requests
from lxml import etree
import time
import random
import pandas as pd
def get_detail_urls(url, HEADERS):
    """Scrape one listing page of ygdy8.net and append film rows to film.csv.

    Parameters
    ----------
    url : str
        Listing-page URL, e.g. http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html
    HEADERS : dict
        HTTP request headers (User-Agent etc.) forwarded to requests.get.

    Side effects
    ------------
    Appends rows 'film_name;detail_url' to film.csv (no header, ';' separator).
    """
    # timeout prevents the crawler from hanging forever on a stalled connection
    resp = requests.get(url, headers=HEADERS, timeout=10)
    # the site serves GBK-encoded pages; ignore undecodable bytes
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    names = html.xpath("//table[@class='tbspan']//a/text()")
    # zip pairs names with hrefs safely even if the two lists differ in
    # length (the original dict-of-columns DataFrame would raise ValueError)
    rows = [(name, 'http://www.ygdy8.net' + href)
            for name, href in zip(names, hrefs)]
    data = pd.DataFrame(rows, columns=['film_name', 'detail_url'])
    data.to_csv('film.csv', index=False, sep=';', mode='a', header=False)
# Browser-like User-Agent so the site does not reject the crawler outright.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

if __name__ == '__main__':
    # Guarded so importing this module no longer triggers a 99-page crawl.
    for page in range(1, 100):
        url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'.format(page)
        # random sub-second delay between requests to be polite to the server
        time.sleep(random.random())
        print('第{}页'.format(page))  # progress: "page N"
        get_detail_urls(url, HEADERS)
# --- Non-code residue copied along with the script from its blog page ---
# 36. Python crawler: movies
# Latest recommended article published 2024-09-12 08:09:57
# 10k+ views
# (collapsed) comments — "why were they folded?"