Company-news search scraping for Caixin (财新网), Eastmoney (东方财富) and Hexun (和讯网)
'''Scrape search-result news from Caixin (财新网). Much of the site's content
is paywalled, so a login is needed for full access.'''
from urllib import request, parse
from bs4 import BeautifulSoup
from random import choice, uniform
import time

# LOGIN_URL = 'http://www.caixin.com/#'
# values = {'user': '974201953@qq.com', 'password': '974201953'}


def get_client():
    # Rotate a random desktop User-Agent for every request
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                  "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                  "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                  "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                  "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                  ]
    return choice(user_agent)


# Fetch the main body text of one article page
def get_content(article_url):
    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    textbox = soup.select('div[class="textbox"]')[0]
    content = textbox.select('div[id="Main_Content_Val"]')[0].text
    return content


# Collect title, link and time for every article on one search-result page
def get_info(url, name, path='F:/caixin_news'):
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    articles = soup.select('div[class="searchtext"]')[0].select('ul')[0].select('li')
    for article in articles:
        try:
            article_title = article.select('div[class="searchxt"]')[0].select('a')[0].text
            article_url = article.select('div[class="searchxt"]')[0].select('a')[0].attrs['href']
            article_time = article.select('div[class="searchxt"]')[0].select('span')[0].text
            article_content = get_content(article_url)
            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, mode='a', encoding='utf-8') as f:
                f.write('标题:' + article_title + '\n')
                f.write('文章链接:' + article_url + '\n')
                f.write('时间:' + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue


# Read how many result pages a search has, then walk through all of them
def get_page(url, name):
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    pagingIndex = soup.select('span[id="pagingIndex"]')[0].text  # text like "1/24"
    pageNum = int(pagingIndex.split('/')[1])
    for i in range(1, pageNum + 1):
        url = 'http://search.caixin.com/search/search.jsp?special=false' \
              '&keyword=%s&page=%s' % (parse.quote(name), i)
        get_info(url, name)
        if i % 10 == 0:
            time.sleep(uniform(2, 5))  # pause briefly every ten pages
        print('已获得%s,第%s页内容' % (name, i))  # doubles as a checkpoint for resuming


'''================= Entry point: create the "F:/caixin_news" folder first ================='''
names = ['巨人网络', '分众传媒']
for name in names:
    print('开始获取 %s 内容' % name)
    url = 'http://search.caixin.com/search/search.jsp?special=false&keyword=%s' % parse.quote(name)
    try:
        get_page(url, name)
    except Exception as e:
        print(e)
        print('-----发生错误------')
        continue
    print('获取%s内容结束' % name)
    time.sleep(uniform(10, 20))

# url = 'http://companies.caixin.com/2017-11-06/101166567.html'
# get_content(url)
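The docstring above notes that most Caixin content sits behind a login, yet the commented-out LOGIN_URL and values are never used. Below is a minimal sketch of how a cookie-based form login could be wired into the same urllib stack. The endpoint URL and form-field names are placeholders, not Caixin's actual login API (which may also involve tokens or a captcha that this sketch ignores).

from urllib import request, parse
from http.cookiejar import CookieJar

# Hypothetical endpoint and field names -- adjust to the site's real login form.
LOGIN_URL = 'http://user.caixin.com/login'
values = {'user': 'you@example.com', 'password': 'your-password'}

def make_logged_in_opener():
    # An opener with a cookie jar keeps the session cookies set at login,
    # so later article requests are made as an authenticated user.
    cookie_jar = CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(cookie_jar))
    data = parse.urlencode(values).encode('utf-8')
    opener.open(request.Request(LOGIN_URL, data=data))  # POST the login form
    return opener

# Usage: build one opener up front and replace request.urlopen(req)
# in get_content/get_info with opener.open(req).
# opener = make_logged_in_opener()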
'''Eastmoney (东方财富) Guba: scrape news and announcements. Known issue: the
total page count cannot be read from the page, so paging stops as soon as a
list page returns fewer than the full 80 items.'''
from urllib import request
from bs4 import BeautifulSoup
import time
from random import choice, uniform


def get_client():
    # Rotate a random desktop User-Agent for every request
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                  "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                  "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                  "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                  "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                  ]
    return choice(user_agent)


# Fetch the body text of one announcement
def get_notice_content(article_url):
    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    return soup.select('div[id="zwconbody"]')[0].get_text()


# Scrape one announcement list page; returns the number of items it held
def get_notice(url, name, path='F:/eastmoney_notice'):
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    articles = soup.select('div[class^="articleh"]')
    m = len(articles)  # a full list page holds 80 entries
    for article in articles:
        try:
            article_title = article.select('span[class="l3"]')[0].get_text()
            article_url = article.select('span[class="l3"]')[0].select('a')[0].attrs['href']
            article_time = article.select('span[class="l5"]')[0].text
            article_content = get_notice_content('http://guba.eastmoney.com' + article_url)
            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, 'a', encoding='utf-8') as f:
                f.write('标题: ' + article_title + '\n')
                f.write('文章链接: ' + 'http://guba.eastmoney.com' + article_url + '\n')
                f.write('时间: ' + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue
    return m


# Fetch the body text of one news article
def get_news_content(article_url):
    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    return soup.select('div[id="zw_body"]')[0].get_text()


# Scrape one news list page; returns the number of items it held
def get_news(url, name, path='F:/eastmoney_news'):
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    articles = soup.select('div[class^="articleh"]')
    m = len(articles)  # a full list page holds 80 entries
    for article in articles:
        try:
            article_title = article.select('span[class="l3"]')[0].get_text()
            # The href is relative, e.g. /news,300104,720500092.html; the full
            # URL needs the http://guba.eastmoney.com prefix
            article_url = article.select('span[class="l3"]')[0].select('a')[0].attrs['href']
            article_time = article.select('span[class="l5"]')[0].text
            article_content = get_news_content('http://guba.eastmoney.com' + article_url)
            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, 'a', encoding='utf-8') as f:
                f.write('标题: ' + article_title + '\n')
                f.write('文章链接: ' + 'http://guba.eastmoney.com' + article_url + '\n')
                f.write('时间: ' + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue
    return m


'''========= Entry point: create the "F:/eastmoney_notice" and "F:/eastmoney_news" folders first ========='''
names = ['002558', '002027']
for name in names:
    print('开始获取: %s' % name)
    page = 1
    news_url = 'http://guba.eastmoney.com/list,%s,1,f_%s.html' % (name, str(page))  # news list URL
    while get_news(url=news_url, name=name) == 80:  # a full page means there may be more
        print("已获得 %s 页新闻内容" % str(page))
        page += 1
        news_url = 'http://guba.eastmoney.com/list,%s,1,f_%s.html' % (name, str(page))
        if page % 10 == 0:
            time.sleep(uniform(2, 5))
    time.sleep(uniform(10, 20))
print('-------------------新闻获取结束,开始获取公告--------------')

names = ['002558', '002027', '300104', '002555', '002624', '300418', '002174', '002425',
         '002123', '300431', '300052', '300043', '600242', '002188', '002131', '000676',
         '300031', '002619', '600652']
for name in names:
    print('开始获取: %s' % name)
    page = 1  # reset the page counter for each stock
    notice_url = 'http://guba.eastmoney.com/list,%s,3,f_%s.html' % (name, str(page))  # announcement list URL
    while get_notice(url=notice_url, name=name) == 80:
        print("已获得%s 页通知内容" % str(page))
        page += 1
        notice_url = 'http://guba.eastmoney.com/list,%s,3,f_%s.html' % (name, str(page))  # advance to the next page
        if page % 10 == 0:
            time.sleep(uniform(2, 5))
    time.sleep(uniform(10, 20))
print('-------------------公告获取结束--------------')
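Both entry loops above repeat the same workaround for the missing page count: keep requesting the next list page while the current one comes back full (80 items). The pattern can be factored into a single helper; a sketch, assuming the list-URL scheme and the item-count return value of get_news/get_notice above:

from random import uniform
import time

FULL_PAGE = 80  # a full Guba list page holds 80 entries

def crawl_all_pages(scrape_page, code, list_type):
    # Page through http://guba.eastmoney.com/list,<code>,<type>,f_<page>.html
    # until a page returns fewer than 80 items, i.e. the last page.
    page = 1
    while True:
        url = 'http://guba.eastmoney.com/list,%s,%s,f_%s.html' % (code, list_type, page)
        n = scrape_page(url=url, name=code)
        print('已获得 %s 页,%s 条' % (page, n))
        if n < FULL_PAGE:
            break
        page += 1
        if page % 10 == 0:
            time.sleep(uniform(2, 5))  # throttle every ten pages

# Usage with the functions defined above (type 1 = news lists, type 3 = announcements):
# crawl_all_pages(get_news, '002558', 1)
# crawl_all_pages(get_notice, '002558', 3)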
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''Scrape related news from Hexun (和讯网); the news search always shows ten pages of results.'''
from bs4 import BeautifulSoup
from urllib import request, parse
import time
from random import choice, uniform


# Main functions: fetch each article's title, link and time
def get_client():
    # Rotate a random desktop User-Agent for every request
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                  "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                  "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                  "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                  "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                  "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                  "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                  ]
    return choice(user_agent)


# Fetch the body text of one article
def get_content(article_url):
    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    return soup.select('div[class="art_contextBox"]')[0].text


# Collect the necessary info from one search-result page
def get_info(url, name, path='F:/hexun_news'):
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    newss = soup.select('div[class^="newslist"]')
    for news in newss:
        try:
            article_title = news.select('span[class="breakdiv"]')[0].text
            article_url = news.select('span[class="breakdiv"]')[0].select('a')[0].attrs['href']
            article_time = news.select('div[class="news-l-t"]')[0].select('span')[1].text
            article_content = get_content(article_url)
            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, 'a', encoding='utf-8') as f:
                f.write("标题: " + article_title + '\n')
                f.write("文章链接:" + article_url + '\n')
                f.write("时间:" + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue


'''================ Entry point: create the "F:/hexun_news" folder first ================'''
names = ['巨人网络', ]
for name in names:
    for i in range(1, 11):  # the search exposes exactly ten pages
        byteName = name.encode('gbk')  # Hexun expects the keyword percent-encoded as GBK
        quoteName = parse.quote(byteName)
        url = r'http://news.search.hexun.com/news?key=%s&t=all&s=0&f=0&page=%s' % (quoteName, i)
        try:
            get_info(url=url, name=name)
        except Exception as e:
            print(e)
            continue
    print('已经获得%s数据' % name)
    time.sleep(uniform(5, 10))
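One detail worth calling out in the Hexun script: the keyword is encoded to GBK bytes before parse.quote, because (as the script assumes) the search endpoint decodes the query string as GBK rather than UTF-8. A small standalone demonstration of the difference:

from urllib import parse

keyword = '巨人网络'
# GBK percent-encoding -- the form this script sends to Hexun
print(parse.quote(keyword.encode('gbk')))
# parse.quote's default encodes the str as UTF-8; a GBK-decoding site
# would read this as a garbled search term
print(parse.quote(keyword))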