# 爬虫训练二 (Spider training exercise 2)
import re
import requests
from lxml import etree
import global_var
import time
import xlwt
## Class declaration
class title:
    """Checks portal sites: scrapes the site name, its columns, and each
    column's latest-article title and publish date.

    Workflow (see __main__): call get_content() first — it caches the parsed
    home-page DOM on self.tree, which the other page-scraping methods reuse.

    NOTE(review): class name kept lowercase ``title`` for backward
    compatibility with existing callers (PEP 8 would prefer ``Title``).
    """

    def __init__(self):
        # Per-instance accumulator. The original used a shared mutable
        # class attribute, which would leak state across instances.
        self.contents = []

    def get_domain(self):
        """Read portal URLs from ``domain.txt``.

        Returns a flat list alternating ``[url, '++', url, '++', ...]``;
        callers iterate it with a step of 2. Each url keeps its trailing
        newline (callers strip it).
        """
        domain = []
        with open('domain.txt') as f:
            urls = f.readlines()
        for url in urls:
            domain.append(url)
            domain.append('++')  # separator consumed by the step-2 iteration
        return domain

    def get_content(self, url):
        """Fetch the portal home page, cache its parsed DOM on ``self.tree``,
        and return the site name (text of ``<span class="mh-name">``).

        A non-200 response only prints a warning (best-effort), matching
        the original behavior.
        """
        content = requests.get(url=url, headers=global_var.headers)
        if content.status_code != 200:
            print("请求失败")
        self.tree = etree.HTML(content.text)
        mh_name = self.tree.xpath('//span[@class="mh-name"]')
        return mh_name[0].text

    def get_article__url(self, url):
        """Return the absolute URL of the picture-news slideshow page.

        Requires get_content() to have been called first (uses self.tree).
        """
        ## Get the picture-news URL link
        pic_News = self.tree.xpath('//div[@class="focusBox focusWrap org-slide "]/a[@class="more"]')
        pic_News_url = url + pic_News[0].attrib['href']
        return pic_News_url

    def get_content_url(self, url):
        """Collect ``{'contents_url': ..., 'contents_name': ...}`` dicts for
        each column listed on the cached home page.

        Requires get_content() to have been called first (uses self.tree).
        Appends into self.contents and returns it; the caller clears the
        list between sites.
        """
        ## Get each column's URL link
        boards = self.tree.xpath('//div[@class="hd"]/ul/li')
        for board in boards:
            link = board.xpath('a')[0]
            # Skip before building the entry (the original stripped the
            # link text even for skipped rows).
            # TODO confirm: the magic href length 12 looks site-specific.
            if len(link.attrib['href']) == 12:
                continue
            self.contents.append({
                'contents_url': url + link.attrib['href'],
                'contents_name': link.text.strip(),
            })
        return self.contents

    def get_article_content(self, domain_url, article_url):
        """Return the newest article's metadata for one column page as
        ``{'last_Publesh_title': ..., 'last_Publish_time': ...}``.

        NOTE(review): the misspelled key ``last_Publesh_title`` is kept —
        existing callers index the dict with that exact spelling.
        """
        ## Get each column's detailed content
        resp = requests.get(article_url, headers=global_var.headers)
        columnGuid = re.findall(r'columnGuid\s\s\=\s\"(.*)\"', resp.text)[0]
        mhScope = re.findall(r'mhScope\s\=\s\"(.*)\"', resp.text)[0]
        list_url = domain_url + global_var.article_url.format(columnGuid, mhScope)
        list_resp = requests.get(url=list_url, headers=global_var.headers)
        # The list endpoint returns \uXXXX-escaped JSON text; decode it.
        s = list_resp.text.encode('utf-8').decode('unicode_escape')
        last_Publish_time = re.findall(r'publishTime\"\:\"([\d]{4}\-[\d]{2}\-[\d]{2})', s)
        last_Publesh_title = re.findall(r'title\"\:\"([\u4e00-\u9fa5|\d|\:\《\》\、\“\”\-\+]{0,})', s)
        content = {}
        content['last_Publesh_title'] = last_Publesh_title[0]
        content['last_Publish_time'] = last_Publish_time[0]
        return content
if __name__ == '__main__':
    # Build the report workbook and check every portal from domain.txt.
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('单位', cell_overwrite_ok=True)
    ## Write results into Excel
    sheet.col(0).width = 256 * 20
    check = title()
    domain_url = check.get_domain()
    # get_domain() returns [url, '++', url, '++', ...]; step by 2 over urls.
    for index in range(0, len(domain_url), 2):
        url = domain_url[index].replace('\n', '')  # strip trailing newline
        mh_name = check.get_content(url)
        article_url = check.get_article__url(url)
        # Fetch once and reuse — the original hit the same page twice,
        # doubling the network round-trips per item.
        pic_news = check.get_article_content(url, article_url)
        pic_news_update = pic_news['last_Publish_time']
        pic_news_title = pic_news['last_Publesh_title']
        print(mh_name)
        print('图片新闻' + ':' + pic_news_update + ':' + pic_news_title)
        sheet.write(index, 0, mh_name)
        sheet.write(index, 2, '图片新闻幻灯片')
        sheet.write(index + 1, 2, pic_news_update)
        content_urls = check.get_content_url(url)
        # enumerate replaces the original's O(n^2) list.index() lookups,
        # which would also misplace columns on duplicate entries.
        for col, content_url in enumerate(content_urls):
            content_name = content_url['contents_name']
            article = check.get_article_content(url, content_url['contents_url'])
            content_update = article['last_Publish_time']
            content_title = article['last_Publesh_title']
            print(content_name + ':' + content_update + ' :' + content_title)
            sheet.write(index, col + 4, content_name)
            sheet.write(index + 1, col + 4, content_update)
        # content_urls is the same object as check.contents; reset per site.
        content_urls.clear()
    # xlwt.Formula(row, col, 'HYPERLINK("http://www.baidu.com","mh_name")')  # add hyperlink
    workbook.save(r"栏目更新检查.xls")