"""Scrape movie titles and detail-page links from dy2018.com into a CSV file.

The script reads the page selector on the listing index to discover every
listing page, then extracts the text between the book-title marks 《…》 of
each movie link together with the link's absolute URL.
"""
import csv
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Movie titles are wrapped in Chinese book-title marks: 《Title》.
pattern = re.compile(r"《(.*?)》")
url = 'https://www.dy2018.com/html/gndy/dyzz/index.html'

# Seconds to wait for the server; without a timeout a stalled connection
# would hang the scraper indefinitely.
REQUEST_TIMEOUT = 10


def fetch_tree(page_url):
    """Download *page_url* and return it parsed as an lxml HTML tree."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # The listing pages are decoded as GBK; apply it to every request so the
    # index page is handled the same way as the paginated pages.
    response.encoding = 'gbk'
    return etree.HTML(response.text)


def iter_movies(root):
    """Yield ``(title, absolute_url)`` for each movie link found in *root*."""
    names = root.xpath('//table/tr/td/b/a/@title')
    hrefs = root.xpath('//table/tr/td/b/a/@href')
    for raw_name, raw_href in zip(names, hrefs):
        match = pattern.search(raw_name)
        if match is None:
            # NOTE(review): the original wrapped ``findall(...)[0]`` in a
            # ``try`` whose handler is not visible in this excerpt; skipping
            # entries without a 《…》 title is an assumption - confirm.
            continue
        yield match.group(1), urljoin(url, raw_href)


def main():
    """Walk every listing page and record the movies found on each."""
    index_root = fetch_tree(url)
    all_pages = index_root.xpath("//select[@name='select']/option/@value")

    # ``with`` guarantees the file is flushed and closed even if a request
    # fails part-way through; ``newline=''`` is what the csv module expects.
    with open('小调网.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        for page in all_pages:
            page_root = fetch_tree(urljoin(url, page))
            for title, href in iter_movies(page_root):
                # NOTE(review): the excerpt is cut off right after ``href`` is
                # computed; writing one "title, href" row per movie is
                # presumed from the opened CSV file - confirm.
                writer.writerow([title, href])


if __name__ == "__main__":
    main()
小调电影名称爬取
最新推荐文章于 2019-12-17 17:03:20 发布