Main file:
# -*- coding: utf-8 -*-
import re
import time
import urlparse

from bs4 import BeautifulSoup

import html_error2
import download_urls2


# Define a class. The imports above pull in my other two .py files: one
# catches errors and re-issues the request, the other crawls the news
# URLs out of a listing page.
class mainmethod(object):

    # The constructor instantiates the classes from those two files.
    def __init__(self):
        self.htmlerror = html_error2.html_errorSovler()
        self.htmlurls = download_urls2.load_urls()

    # The main method first crawls the site's listing pages, then pulls
    # out the news URLs inside them for further processing.
    def pachong(self, urll, nnn, s, p):
        # The site's home address is stored in page_url so that any URL
        # without a complete address can later be completed with urljoin.
        page_url = 'https://www.gatra.com'
        # s and p are passed in by the caller. This not only lets the
        # crawl keep looping onward, it also works as a flag: if the
        # crawl breaks off, it can resume from here instead of starting
        # over from the beginning.
        n = s
        # Worth mentioning: n is the article id, counting from the first
        # page crawled, while nn drives the listing's page parameter; one
        # listing page holds many news items, and nn is the value that
        # gets incremented for that parameter.
        nn = p
        # nnn is the largest page number I found, i.e. the last listing page.
        while nn <= nnn:
            # Why multiply by 7? Oddly, this site's paging counts by the
            # number of news items, seven per page, hence the factor of 7.
            num = nn * 7
            # Call the method in my URL-fetching file, passing the listing
            # URL, the start offset and the article id; it returns the
            # content and the response URL.
            result, result_url = self.htmlurls.geturls(urll, num, n)
            urls = re.findall('<div class="catItemImageBlock-inner">.*?<a href="(.*?)"', result, re.S)
            # Loop over all the extracted URLs.
            for url in urls:
                url = urlparse.urljoin(page_url, url)
                # Print the counters and the URL to see how far the crawl
                # has got.
                print str(n) + '\n' + str(nn) + '\n'
                print url + '\n'
                # Fetch the article page itself; again this returns the
                # content and the response URL.
                result1, response_url = self.htmlurls.getcontent(url, n)
                # Open a file and save the raw HTML.
                with open('/home/qin/pachong/zy/www.gatra.com.html/html/' + str(n) + '.html', 'w') as fff:
                    fff.write(result1)
                # Parse with BeautifulSoup.
                soup = BeautifulSoup(result1, "html.parser")
                # Write the extracted fields out in a JSON-like format.
                with open('/home/qin/pachong/zy/www.gatra.com.html/data/' + str(n) + '.json', 'a') as uuu:
                    source_id = '65change a web to "https://www.gatra.com"'
                    language = 'eng'
                    request_url = url
                    uuu.write('source_id :' + source_id + '\n' + 'language :' + language + '\n' + 'request_url :' + request_url + '\n' + 'response_url :' + response_url.encode('utf-8', 'ignore') + '\n')
                    # Category.
                    classification = soup.find('span', class_='category')
                    uuu.write('classification:' + classification.a.get_text() + '\n')
                    uuu.write('abstract:' + '\n')
                    # Title.
                    title = soup.find('h2', class_='itemTitle')
                    uuu.write('title:' + title.get_text().strip().encode('utf-8', 'ignore') + '\n')
                    # Body.
                    bodys = soup.find_all('p')
                    uuu.write('body:' + '\n')
                    for body in bodys:
                        uuu.write(body.get_text().encode('utf-8', 'ignore') + '\n')
                    uuu.write('\n')
                    pub_time = soup.find('span', class_='itemDateCreated')
                    # Extract the publication time and store it reformatted.
                    pub_times = re.search('([A-Za-z]+), ([0-9]+) ([A-Za-z]+) ([0-9]+) ([0-9]+):([0-9]+)', pub_time.get_text())
                    uuu.write('pub_time:' + pub_times.group(4).encode('utf-8', 'ignore') + '-' + pub_times.group(3).encode('utf-8', 'ignore') + '-' + pub_times.group(1).encode('utf-8', 'ignore') + '-' + pub_times.group(2).encode('utf-8', 'ignore') + ' ' + pub_times.group(5).encode('utf-8', 'ignore') + ':' + pub_times.group(6).encode('utf-8', 'ignore') + ':00' + '\n')
                    # Collection timestamp.
                    cole_time = time.time()
                    uuu.write('cole_time:' + str(cole_time) + '\n' + '\n')
                    # All outbound links; incomplete ones are completed
                    # with the site's root URL.
                    out_first_links = soup.find_all('a', href=re.compile('/.+'))
                    uuu.write('out_links:' + '\n')
                    for out_first_link in out_first_links:
                        new_full_url = urlparse.urljoin(page_url, out_first_link['href'])
                        uuu.write(new_full_url.encode('utf-8', 'ignore') + '\n')
                    out_links = soup.find_all('a', href=re.compile('h.+'))
                    for out_link in out_links:
                        uuu.write(out_link['href'].encode('utf-8', 'ignore') + '\n')
                    uuu.write('\n\n' + 'images:' + '\n')
                    images = soup.find_all('img')
                    for image in images:
                        uuu.write(image['src'].encode('utf-8', 'ignore') + '\n')
                n += 1
            nn += 1
            print str(n) + '\n'
        return n


lala = mainmethod()
kkk = lala.pachong('https://www.gatra.com/politik/partai?start=', 213, 1, 1)
kkk = lala.pachong('https://www.gatra.com/politik/politik?start=', 823, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/politik/pemilu/kpu?start=', 37, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/politik/pemilu/bawaslu?start=', 11, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/politik/pemilu/pilkada?start=', 292, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/nasional?start=', 1411, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/jabodetabek?start=', 499, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/sumatera?start=', 169, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/jawa?start=', 1002, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/kalimantan?start=', 39, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/sulawesi?start=', 53, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/bali-nusa-tenggara?start=', 90, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/nusantara/maluku-papua?start=', 94, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/internasional/amerika?start=', 77, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/internasional/eropa?start=', 80, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/internasional/timur-tengah?start=', 67, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/internasional/asia-oseania?start=', 126, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/ekonomi/makro?start=', 323, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/ekonomi/properti?start=', 103, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/ekonomi/finansial?start=', 334, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/ekonomi/industri?start=', 518, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/ekonomi/perdagangan?start=', 185, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/hukum?start=', 2954, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/life-health/sehat?start=', 138, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/life-health/intim?start=', 4, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/olahraga?start=', 496, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/iltek/gadget?start=', 79, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/iltek/telko?start=', 67, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/iltek/sains?start=', 60, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/iltek/internet?start=', 162, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/entertainment/apa-siapa?start=', 599, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/entertainment/musik?start=', 351, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/entertainment/film?start=', 255, kkk, 1)
kkk = lala.pachong('https://www.gatra.com/entertainment/televisi?start=', 83, kkk, 1)
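A quick illustration of what the pub_time handling does: the regex splits the date string from the itemDateCreated span into weekday, day, month name, year, hour and minute, then rearranges them. The sample string below is my guess at the site's format, not taken from a real page:

import re

# Hypothetical input; real strings come from <span class="itemDateCreated">.
sample = 'Monday, 05 June 2017 14:30'
m = re.search('([A-Za-z]+), ([0-9]+) ([A-Za-z]+) ([0-9]+) ([0-9]+):([0-9]+)', sample)
# Groups: 1=weekday, 2=day, 3=month name, 4=year, 5=hour, 6=minute.
print m.group(4) + '-' + m.group(3) + '-' + m.group(1) + '-' + m.group(2) \
    + ' ' + m.group(5) + ':' + m.group(6) + ':00'
# Prints: 2017-June-Monday-05 14:30:00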
Error correction file:
# -*- coding: utf-8 -*-
import html_downloader2


# When a request errors out, use recursion to re-issue it until it succeeds.
class html_errorSovler(object):

    # The constructor instantiates the class from the HTML downloader file.
    def __init__(self):
        self.htmlDownloader = html_downloader2.HtmlDownloader()

    # Retry a failed request; if the retry fails as well, call this
    # method again recursively.
    def errorRedo(self, url, cot):
        try:
            print 'RE:craw %d: %s' % (cot, url)
            html_cont, response_url = self.htmlDownloader.download(url)
            return html_cont, response_url
        except Exception:
            html_cont, response_url = self.errorRedo(url, cot)
            return html_cont, response_url
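One caveat: errorRedo has no base case, so a URL that never comes back will recurse until Python hits its recursion limit and raises a RuntimeError. A minimal bounded variant might look like this; the class name and the cap of five attempts are my own choices, not part of the original code:

# -*- coding: utf-8 -*-
import html_downloader2


class html_errorSovlerBounded(object):

    def __init__(self, max_retries=5):
        self.htmlDownloader = html_downloader2.HtmlDownloader()
        self.max_retries = max_retries

    # Same interface as errorRedo, but gives up after max_retries
    # attempts instead of recursing without limit.
    def errorRedo(self, url, cot):
        for attempt in range(self.max_retries):
            try:
                print 'RE:craw %d (attempt %d): %s' % (cot, attempt + 1, url)
                return self.htmlDownloader.download(url)
            except Exception:
                continue
        raise IOError('giving up on %s after %d retries' % (url, self.max_retries))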
URL downloader:
# -*- coding: utf-8 -*-
import requests

import html_error2


class load_urls(object):

    # The constructor instantiates the error-handling class.
    def __init__(self):
        self.htmlerror = html_error2.html_errorSovler()

    # Fetch a listing page: url is the base listing URL, num the "start"
    # offset appended to it, n the current article id (used in logging).
    def geturls(self, url, num, n):
        try:
            res = requests.get(url + str(num),
                               headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'},
                               proxies={'http': 'http://127.0.0.1:37867',
                                        'https': 'https://127.0.0.1:37867'})
            result = res.content
            response_url = res.url
            return result, response_url
        except Exception:
            # Retry the complete paged URL, offset included.
            result, response_url = self.htmlerror.errorRedo(url + str(num), n)
            return result, response_url

    # Same as above, but fetches an individual news page.
    def getcontent(self, url, num):
        try:
            res = requests.get(url,
                               headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'},
                               proxies={'http': 'http://127.0.0.1:37867',
                                        'https': 'https://127.0.0.1:37867'})
            result = res.content
            response_url = res.url
            return result, response_url
        except Exception:
            result, response_url = self.htmlerror.errorRedo(url, num)
            return result, response_url
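Another detail worth flagging: requests.get has no default timeout, so a stalled connection hangs the whole crawl silently instead of raising an exception for errorRedo to catch. Passing a timeout makes a hung request fail fast and fall into the retry path; the 30-second value and the example URL below are arbitrary choices of mine:

import requests

# Same call shape as in geturls, with an explicit timeout added; the URL
# and offset here are just an example.
res = requests.get('https://www.gatra.com/hukum?start=' + str(7),
                   headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'},
                   proxies={'http': 'http://127.0.0.1:37867',
                            'https': 'https://127.0.0.1:37867'},
                   timeout=30)  # seconds; raises requests.exceptions.Timeout when exceeded
print res.url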
HTML downloader:
# -*- coding: utf-8 -*-
import requests


class HtmlDownloader(object):

    # HTML downloader: the request carries two extras, a headers dict and
    # a proxy; here I use Lantern's free local proxy.
    def download(self, url):
        res = requests.get(url,
                           headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0'},
                           proxies={'http': 'http://127.0.0.1:37867',
                                    'https': 'https://127.0.0.1:37867'})
        # Return the content and the response URL.
        result = res.content
        response_url = res.url
        return result, response_url
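The downloader can also be smoke-tested on its own; this sketch assumes the Lantern proxy really is listening on 127.0.0.1:37867, otherwise the request will fail:

if __name__ == '__main__':
    downloader = HtmlDownloader()
    content, response_url = downloader.download('https://www.gatra.com')
    # Print the final URL (after any redirects) and the size of the body.
    print response_url
    print len(content)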