""" Created on Fri Jan 19 18:58:41 2018 人民网新闻爬虫 @author: gzs10227 """ import sys stderr = sys.stderr stdout = sys.stdout reload(sys) sys.setdefaultencoding('utf8') sys.stderr = stderr sys.stdout = stdout import urllib2,urllib urllib.getproxies_registry = lambda: {} import requests from lxml import etree import re,time,datetime import os base_path = u'C:/Users/gzs10227/Desktop/廖庆豪' TYPE_DICT = {} def open_url(url): time.sleep(0.5) headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'} response = requests.get(url,headers = headers) response.encoding = 'utf-8' return response.content url = 'http://www.people.com.cn/' html = open_url(url) web_data = etree.HTML(html) links = web_data.xpath(r'//div[@class="w1000"]//span/a/@href') types = web_data.xpath(r'//div[@class="w1000"]//span/a/text()') for i in range(len(links)): link = links[i] key = link.replace('http://','').replace('.people.com.cn/','') result_path = '%s/%s' % (base_path, types[i]) if not os.path.exists(result_path): os.makedirs(result_path) TYPE_DICT[key] = types[i] def get_newtype_href(link): print link html = open_url(link) type_hrefs = re.findall(re.compile(r'href="(.*?)"'),html) newtype = link.replace('http://','').replace('.people.com.cn/','') type_hrefs = [i for i in type_hrefs if newtype in i and 'css' not in i and link != i] index_hrefs = list(set([i for i in type_hrefs if i.endswith('index.html')])) content_urls = list(set(type_hrefs) - set(index_hrefs)) for url in index_hrefs: print url html = open_url(link) curls = re.findall(re.compile(r'href="(.*?)"'),html) clear_url = [] for c in curls: if 'n1' not in c or 'css' in c: continue else: if c.startswith('/n1'): c = link[:-1] + c clear_url.append(c) else: clear_url.append(c) content_urls.extend(clear_url) content_urls = list(set(content_urls)) return content_urls links = [i for i in links if 'renshi' not in i and 'news' not in i] result_link = 
map(get_newtype_href,links) result_links = [] for i in result_link: for j in i: if j.endswith('.html') and j.startswith('http') and 'n1' in j: result_links.append(j.replace(' ','').replace(' ','').replace('\t','')) result_links = list(set(result_links)) def get_content(href): key = re.findall(re.compile(r'http://(.*?).people.com'),href)[0] html = open_url(href) web_data = etree.HTML(html) print TYPE_DICT[key] result_path = '%s/%s/' % (base_path, TYPE_DICT[key]) try: title = web_data.xpath('//div[@class="clearfix w1000_320 text_title"]//h1/text()')[0] content = web_data.xpath('//div[@class="box_con"]//p//text()') contents = '' for c in content: contents = contents + c except: title = '' contents = '' print title filename = str(int(time.time() * 1000)) + '.txt' with open(result_path + filename,'w') as f: f.writelines(title) f.writelines(contents) map(get_content,result_links)
# 毕业设计之数据获取【人民网】
# 最新推荐文章于 2024-10-02 12:26:07 发布