# A simple "adaptive" crawler. Parameters:
# class Pa:
#   urlone  : URL of the first page of search results
#   pagenum : number of result pages to crawl
# method getinfo_url:
#   signlist  : list of tags, in nesting order, that wrap the content to extract, e.g. ['div', 'a']
#   classname : matches signlist one-to-one; the value of each tag's "class=" attribute,
#               e.g. ['aa', None] (None means the tag has no class)
# method write_info:
#   h_sign : on the detail page, list of tags wrapping the title, e.g. ['div', 'h1']
#   h_name : matches h_sign one-to-one; the class value of each tag (same convention as above)
#   c_sign : list of tags wrapping the body content
#   c_name : class values matching c_sign one-to-one

import json
import os
import re

import requests
from lxml import etree


class Pa:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko '
                      'Core/1.70.3676.400 QQBrowser/10.4.3505.400'
    }

    def __init__(self, urlone, pagenum):
        self.urlstart = urlone
        self.info_url = []
        self.pagenum = pagenum

    # Collect the detail-page URLs. signlist holds the tags (<a>, <div>, ...),
    # classname holds each tag's class value (the 'aa' in class="aa"), or None.
    def getinfo_url(self, signlist=None, classname=None):
        signlist = signlist or []
        classname = classname or []
        url_split = self.urlstart.split('&p=1')
        for pageindex in range(1, self.pagenum + 1):
            try:
                print('Crawling result page, pageindex:', pageindex)
                # Rebuild the search URL for this page number
                url1 = url_split[0] + '&p={}'.format(pageindex) + url_split[1]
                res = requests.get(url1, headers=self.headers)
                res.encoding = 'utf-8'
                page = etree.HTML(res.text)
                # Build the XPath to the detail-page links from signlist/classname
                xpathstr = ''
                for index in range(len(signlist)):
                    if classname[index] is None:
                        xpathstr += '/{}'.format(signlist[index])
                    else:
                        xpathstr += '/{}[@class="{}"]'.format(signlist[index], classname[index])
                xpathstr = '/' + xpathstr + '//@href'
                dataurl = page.xpath(xpathstr)
                if dataurl == []:
                    print('Last page reached, stopping')
                    break
                dataurl = list(set(dataurl))
                for everurl_index in range(len(dataurl)):
                    # Relative link: prepend the site root taken from the start URL
                    if dataurl[everurl_index][0] == '/':
                        url_first = re.findall(r"https://(.*?)/", self.urlstart)
                        if url_first == []:
                            url_first = re.findall(r"http://(.*?)/", self.urlstart)
                            dataurl[everurl_index] = 'http://' + url_first[0] + dataurl[everurl_index]
                        else:
                            dataurl[everurl_index] = 'https://' + url_first[0] + dataurl[everurl_index]
                self.info_url.extend(dataurl)
            except Exception:
                print('Error while crawling page:', pageindex)

    # Extract the title and body text of each detail page and write one .txt file per page.
    # h_sign/h_name describe the title tags and their classes, c_sign/c_name the body content.
    def write_info(self, path='./', h_sign=None, h_name=None, c_sign=None, c_name=None):
        h_sign = h_sign or []
        h_name = h_name or []
        c_sign = c_sign or []
        c_name = c_name or []
        if self.info_url == []:
            print('No URLs to read; run "getinfo_url" first')
            return
        os.makedirs(path, exist_ok=True)  # make sure the output directory exists
        index = 0
        for url_i in self.info_url:
            try:
                content = ''
                index += 1
                res = requests.get(url_i, headers=self.headers)
                res.encoding = 'utf-8'
                page = etree.HTML(res.text)
                # XPath for the title
                h_str = '/'
                for h_index in range(len(h_sign)):
                    if h_name[h_index] is None:
                        h_str += '/{}'.format(h_sign[h_index])
                    else:
                        h_str += '/{}[@class="{}"]'.format(h_sign[h_index], h_name[h_index])
                h_str = h_str + '//text()'
                h_element = page.xpath(h_str)
                headtext = h_element[0]
                content += h_element[0] + '\n'
                # XPath for the body content
                c_str = '/'
                for c_index in range(len(c_sign)):
                    if c_name[c_index] is None:
                        c_str += '/{}'.format(c_sign[c_index])
                    else:
                        c_str += '/{}[@class="{}"]'.format(c_sign[c_index], c_name[c_index])
                c_str = c_str + '//text()'
                c_element = page.xpath(c_str)
                for i in c_element:
                    content += '\n'
                    content += '\t' + i
                filename = '{}/{}_{}.txt'.format(path, index, headtext.strip())
                with open(filename, 'w+', encoding='utf-8') as f:
                    f.write(content)
                print('Written:', index)
            except Exception:
                print('Error while writing:', index)
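The heart of the "adaptive" part is the loop that turns the tag list and class list into an XPath expression. Below is a minimal standalone sketch of that rule (the build_xpath helper is only illustrative, not part of the class), applied to the sample list-page parameters used later:

def build_xpath(signlist, classname, target='//@href'):
    # Illustrative helper mirroring the loops in getinfo_url/write_info:
    # each tag becomes one path step; a non-None class adds an [@class="..."] predicate.
    xpathstr = ''
    for tag, cls in zip(signlist, classname):
        if cls is None:
            xpathstr += '/{}'.format(tag)
        else:
            xpathstr += '/{}[@class="{}"]'.format(tag, cls)
    return '/' + xpathstr + target

print(build_xpath(['h3', 'a'], ['res-title', None]))
# -> //h3[@class="res-title"]/a//@href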
Dump the parameters to a JSON file:
twofile = {
    'sum': ['pagenum', 'test_url', 'test_sign', 'test_classname', 'test_h_sign', 'test_h_name',
            'test_c_sign', 'test_c_name', 'path'],
    'pagenum': 500,
    'test_url': 'http://sousuo.gov.cn/s.htm?q=%E4%BF%A1%E6%81%AF%E5%AE%89%E5%85%A8&n=10&p=1&t=govall&timetype=timeqb&mintime=&maxtime=&sort=&sortType=1&nocorrect=',
    'test_sign': ['h3', 'a'],
    'test_classname': ['res-title', None],
    'test_h_sign': ['div', 'h1'],
    'test_h_name': ['article oneColumn pub_border', None],
    'test_c_sign': ['div', 'p'],
    'test_c_name': ['pages_content', None],
    'path': 'data3'
}

# Serialize to JSON
twofile = json.dumps(twofile)
# Write to the parameter directory (create it first if it does not exist yet)
os.makedirs('canshu', exist_ok=True)
with open('canshu/twofile.json', 'w+') as f:
    f.write(twofile)
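Since the 'sum' list enumerates the keys a parameter file is expected to carry, it can double as a checklist before a crawl is started. A small sketch of such a check (check_canshu is a hypothetical helper, not part of the original code):

def check_canshu(filepath):
    # Hypothetical helper: report any key named in 'sum' that the file itself is missing.
    with open(filepath, 'r') as f:
        canshu = json.loads(f.read())
    missing = [key for key in canshu.get('sum', []) if key not in canshu]
    if missing:
        print('Missing keys in {}: {}'.format(filepath, missing))
    return missing == []

# Example: check_canshu('canshu/twofile.json')  ->  True when the file is complete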
Call the functions:
# Drive the crawler: read every parameter file under canshu/ and run one crawl per file
def result():
    files = os.listdir('canshu')
    for file in files:
        print(file)
        filepath = 'canshu/' + file
        with open(filepath, 'r') as f:
            canshu = f.read()
        canshu = json.loads(canshu)
        test_url = canshu['test_url']
        # Parameters for getinfo_url
        test_sign = canshu['test_sign']
        test_classname = canshu['test_classname']
        # Parameters for write_info
        test_h_sign = canshu['test_h_sign']
        test_h_name = canshu['test_h_name']
        test_c_sign = canshu['test_c_sign']
        test_c_name = canshu['test_c_name']
        path = canshu['path']
        pagenum = canshu['pagenum']

        test1 = Pa(test_url, pagenum)
        test1.getinfo_url(test_sign, test_classname)
        test1.write_info(path, test_h_sign, test_h_name, test_c_sign, test_c_name)


result()
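For a quick one-off run, the class can also be driven directly with the sample parameters from above, skipping the JSON round-trip entirely. A minimal sketch (pagenum is set to 2 here only to keep the test short):

if __name__ == '__main__':
    test_url = ('http://sousuo.gov.cn/s.htm?q=%E4%BF%A1%E6%81%AF%E5%AE%89%E5%85%A8&n=10&p=1&t=govall'
                '&timetype=timeqb&mintime=&maxtime=&sort=&sortType=1&nocorrect=')
    spider = Pa(test_url, 2)
    spider.getinfo_url(['h3', 'a'], ['res-title', None])
    spider.write_info('data3', ['div', 'h1'], ['article oneColumn pub_border', None],
                      ['div', 'p'], ['pages_content', None])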