import csv      # CSV output
import re       # regex-based HTML parsing
import random   # random User-Agent selection
from ua_info import ua_list   # project-local pool of User-Agent strings
from urllib import request


class Dyttspider(object):
    """Scraper for the dytt8.net 'latest movies' listing page.

    Downloads the first listing page, extracts movie titles with a regex,
    and can optionally persist them to ``maoyan.csv``.
    """

    def __init__(self):
        # First page of the "latest movies" (dyzz) listing.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
        # self.url1 = 'https://www.dytt8.net/{}'

    def parse_html(self, html):
        """Extract movie titles from a listing page.

        :param html: decoded page source
        :return: list of title strings captured by the regex
        """
        bds = '<table width="100%".*?<td width="5%".*?class="ulink">(.*?)</a>.*?</table>'
        # re.S lets .*? span newlines, since each entry spreads over several lines.
        pattern = re.compile(bds, re.S)
        l_list = pattern.findall(html)
        for i in l_list:
            print('movie:%s \n' % i)
        return l_list

    def get_html(self, url):
        """Fetch *url* and return the page body as a Unicode string.

        A User-Agent is picked at random from ``ua_list`` for each request.
        The body is decoded as gb2312 (the site's declared charset),
        silently dropping undecodable bytes.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # Use the response as a context manager so the HTTP connection is
        # always closed (the original version leaked it).
        with request.urlopen(req) as res:
            html = res.read().decode('gb2312', 'ignore')
        print('OK')
        return html

    def save_file(self, l_list):
        """Write the scraped titles to ``maoyan.csv`` as one CSV row."""
        # newline='' stops the csv module from emitting blank lines on Windows.
        with open('maoyan.csv', 'w', newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(l_list)

    def run(self):
        """Entry point: fetch the listing page and parse out the titles."""
        url = self.url
        html1 = self.get_html(url)
        self.parse_html(html1)
        # self.save_file(jiexi)


if __name__ == "__main__":
    spider = Dyttspider()
    spider.run()
二级页面下载地址爬取
于 2022-09-21 12:09:40 首次发布