Completed crawlers

Search-based collection of company-related news from Caixin (财新网), Eastmoney (东方财富), and Hexun (和讯网)

'''Caixin: collect news via the site search; much of the site's content is paywalled, so a login is needed'''

from urllib import request, parse
from bs4 import BeautifulSoup
from random import choice, uniform
import time

#LOGIN_URL = 'http://www.caixin.com/#'
#values = {'user': '974201953@qq.com','password':'974201953'}
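
# Hedged sketch (not in the original script): the login constants above are left unused.
# One simple way to reach paywalled Caixin pages is to copy the Cookie header from a
# logged-in browser session and attach it to each request; the value below is only a
# placeholder you would fill in yourself.
CAIXIN_COOKIE = ''  # paste the full Cookie header string from your browser's dev tools

def add_auth(req):
    """Attach the browser session cookie (if provided) to an urllib Request."""
    if CAIXIN_COOKIE:
        req.add_header('Cookie', CAIXIN_COOKIE)
    return req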

# Return a random User-Agent string so successive requests don't share one browser fingerprint
def get_client():
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                    ]
    return choice(user_agent)

# Get the main body text of an article page
def get_content(article_url):

    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)
    #print(resp.read().decode('utf-8'))
    soup = BeautifulSoup(resp.read(),'lxml')
    textbox = soup.select('div[class="textbox"]')[0]
    content = textbox.select('div[id="Main_Content_Val"]')[0].text

    return content


# Collect the basic info for each article on a search-results page and save it to disk
def get_info(url, name,path='F:/caixin_news'):

    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)

    soup = BeautifulSoup(resp.read(),'lxml')
    articles = soup.select('div[class="searchtext"]')[0].select('ul')[0].select('li')
    for article in articles:
        try:
            article_title = article.select('div[class="searchxt"]')[0].select('a')[0].text
            article_url = article.select('div[class="searchxt"]')[0].select('a')[0].attrs['href']
            article_time = article.select('div[class="searchxt"]')[0].select('span')[0].text
            article_content = get_content(article_url)

            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, mode='a', encoding='utf-8') as f:
                f.write('标题:' + article_title + '\n')
                f.write('文章链接:' + article_url + '\n')
                f.write('时间:' + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue

# Find how many result pages a search has, then walk through each page
def get_page(url,name):

    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)

    soup = BeautifulSoup(resp.read(),'lxml')
    pagingIndex = soup.select('span[id="pagingIndex"]')[0].text
    pageNum = int(pagingIndex.split('/')[1])
    for i in range(1,pageNum+1):
        url = 'http://search.caixin.com/search/search.jsp?special=false' \
              '&keyword=%s&page=%s'%(parse.quote(name),i)
        get_info(url,name)
        if i % 10 == 0:
            time.sleep(uniform(2, 5))
        print('已获得%s,第%s页内容'%(name,i))  # this progress line can serve as a resume checkpoint after an interruption
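
# Hedged sketch (not in the original script): the progress print above tells you the
# last page that finished, so an interrupted run can be resumed.  A variant of
# get_page that accepts a start page could look like this:
def get_page_resume(url, name, start_page=1):
    """Same as get_page, but begins at start_page so an interrupted crawl can resume."""
    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    req.add_header('Connection', 'keep-alive')
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')
    pageNum = int(soup.select('span[id="pagingIndex"]')[0].text.split('/')[1])
    for i in range(start_page, pageNum + 1):
        page_url = 'http://search.caixin.com/search/search.jsp?special=false' \
                   '&keyword=%s&page=%s' % (parse.quote(name), i)
        get_info(page_url, name)
        if i % 10 == 0:
            time.sleep(uniform(2, 5))
        print('已获得%s,第%s页内容' % (name, i))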

'''================= Entry point: create the "F:/caixin_news" folder first ================='''
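
# Hedged convenience (not in the original script): create the output folder here
# instead of by hand; the same idea applies to the Eastmoney and Hexun scripts below
# with their respective paths.
import os
os.makedirs('F:/caixin_news', exist_ok=True)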

names = ['巨人网络', '分众传媒']  # search keywords: Giant Network, Focus Media

for name in names:

    print('开始获取 %s 内容'%name )

    url ='http://search.caixin.com/search/search.jsp?special=false&keyword=%s'%parse.quote(name)
    try:
        get_page(url,name)
    except Exception as e:
        print(e)
        print('-----发生错误------')
        continue

    print('获取%s内容结束'%name)
    time.sleep(uniform(10, 20))
# url = 'http://companies.caixin.com/2017-11-06/101166567.html'
# get_content(url)

 

'''Eastmoney: collect news and announcements; known issue: the total page count cannot be read, so the drivers below page forward until a page returns fewer than 80 entries'''

from urllib import request
from bs4 import BeautifulSoup
import time
from random import choice, uniform


def get_client():
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                    ]
    return choice(user_agent)

# Get the body text of an announcement page
def get_notice_content(article_url):

    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')

    return soup.select('div[id="zwconbody"]')[0].get_text()


# Collect the announcement listings for one stock code and save each announcement
def get_notice(url,name,path ='F:/eastmoney_notice'):

    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')

    articles = soup.select('div[class^="articleh"]')
    m = len(articles)  # a full listing page has 80 entries

    for article in articles:
        try:
            article_title = article.select('span[class="l3"]')[0].get_text()
            article_url = article.select('span[class="l3"]')[0].select('a')[0].attrs['href']
            article_time = article.select('span[class="l5"]')[0].text
            article_content = get_notice_content('http://guba.eastmoney.com'+article_url)
            article_path = path+'/'+name+"_"+str(time.time()).replace('.','')+'.txt'

            with open(article_path,'a',encoding='utf-8') as f:
                f.write("标题: "+article_title+'\n')
                f.write("文章链接: "+'http://guba.eastmoney.com'+article_url+'\n')
                f.write('时间: '+article_time+'\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue
    return m


# Get the body text of a news article page
def get_news_content(article_url):

    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')

    return soup.select('div[id="zw_body"]')[0].get_text()

# Collect the news listings for one stock code and save each article
def get_news(url,name,path ='F:/eastmoney_news'):

    req = request.Request(url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')

    articles = soup.select('div[class^="articleh"]')
    m = len(articles)  # a full listing page has 80 entries

    for article in articles:
        try:
            article_title = article.select('span[class="l3"]')[0].get_text()
            article_url = article.select('span[class="l3"]')[0].select('a')[0].attrs['href']
            # href is relative, e.g. /news,300104,720500092.html; the full URL needs the http://guba.eastmoney.com prefix
            article_time = article.select('span[class="l5"]')[0].text
            article_content = get_news_content('http://guba.eastmoney.com'+article_url)
            article_path = path + '/' + name + "_" + str(time.time()).replace('.', '') + '.txt'

            with open(article_path,'a',encoding='utf-8') as f:
                f.write("标题: "+article_title+'\n')
                f.write("文章链接: "+'http://guba.eastmoney.com'+article_url+'\n')
                f.write('时间: '+article_time+'\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue

    return m
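
# Hedged helper (not in the original script): the listing pages expose no total page
# count, so the driver code below simply pages forward while each page returns a full
# batch of 80 entries.  A small wrapper makes that loop explicit; list_type 1 is the
# news board and 3 the announcement board, fetch is get_news or get_notice.
def crawl_all_pages(code, list_type, fetch):
    """Page through guba listings for one stock code until a non-full page appears."""
    page = 1
    while True:
        list_url = 'http://guba.eastmoney.com/list,%s,%s,f_%s.html' % (code, list_type, page)
        if fetch(url=list_url, name=code) < 80:
            break
        page += 1
        if page % 10 == 0:
            time.sleep(uniform(2, 5))
    return page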


'''========================= Entry point: create the "F:/eastmoney_notice" and "F:/eastmoney_news" folders first ============================'''

names = ['002558','002027']  # stock codes to crawl

for name in names:

    print('开始获取: %s'%name)
    page = 1
    news_url = 'http://guba.eastmoney.com/list,%s,1,f_%s.html'%(name,str(page)) # news listing URL for this stock code

    while get_news(url = news_url, name = name) == 80:

        print("已获得 %s 页新闻内容" % str(page))
        page += 1
        news_url = 'http://guba.eastmoney.com/list,%s,1,f_%s.html'%(name,str(page))
        if page % 10 == 0:
            time.sleep(uniform(2, 5))

    time.sleep(uniform(10, 20))
print('-------------------新闻获取结束,开始获取公告--------------')

names = ['002558','002027','300104','002555','002624','300418','002174','002425'
    ,'002123','300431','300052','300043','600242','002188','002131','000676','300031','002619','600652']
for name in names:
    print('开始获取: %s' % name)
    page = 1  # reset the page counter for each stock code
    notice_url = 'http://guba.eastmoney.com/list,%s,3,f_%s.html'%(name,str(page)) # announcement listing URL for this stock code
    while get_notice(url = notice_url, name = name) == 80:

        print("已获得%s 页通知内容" % str(page))
        page += 1
        news_url = 'http://guba.eastmoney.com/list,%s,3,f_%s.html'%(name,str(page))
        if page % 10 == 0:
            time.sleep(random(2,5))
    time.sleep(uniform(10, 20))
print('-------------------公告获取结束--------------')

 

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''Collect related news from Hexun; Hexun search always shows a fixed ten pages of results'''


from bs4 import BeautifulSoup
from urllib import request,parse
import time
from random import choice, uniform
# Main functions: extract each article's title, link, and time

def get_client():
    user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
                    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
                    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
                    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
                    ]
    return choice(user_agent)

def get_content(article_url):
    req = request.Request(article_url)
    req.add_header('User-Agent', get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'lxml')

    return soup.select('div[class="art_contextBox"]')[0].text

# Collect the required info for each result on a search page and save it to disk
def get_info(url, name,path = 'F:/hexun_news'):

    req = request.Request(url)
    req.add_header('User-Agent',get_client())
    resp = request.urlopen(req)
    soup = BeautifulSoup(resp.read(),'lxml')

    newss = soup.select('div[class^="newslist"]')
    for news in newss:
        try:
            article_title = news.select('span[class="breakdiv"]')[0].text
            article_url = news.select('span[class="breakdiv"]')[0].select('a')[0].attrs['href']
            article_time = news.select('div[class="news-l-t"]')[0].select('span')[1].text
            article_content = get_content(article_url)

            article_path = path + '/' + name + '_' + str(time.time()).replace('.', '') + '.txt'
            with open(article_path, 'a', encoding='utf-8') as f:
                f.write("标题: " + article_title + '\n')
                f.write("文章链接:" + article_url + '\n')
                f.write("时间:" + article_time + '\n')
                f.write(article_content)
        except Exception as e:
            print(e)
            continue




'''================ Entry point: create the "F:/hexun_news" folder first ==========='''
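
# Hedged helper (not in the original script): Hexun's search endpoint expects the
# keyword to be GBK-encoded before URL-quoting, and the result list is fixed at ten
# pages, so the driver below simply walks pages 1 through 10.
def hexun_search_url(keyword, page):
    """Build a Hexun news-search URL from a keyword and page number."""
    return 'http://news.search.hexun.com/news?key=%s&t=all&s=0&f=0&page=%s' % (
        parse.quote(keyword.encode('gbk')), page)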

names = ['巨人网络']  # search keyword: Giant Network (巨人网络)

for name in names:
    for i in range(1,11):
        byteName = name.encode('gbk')
        quoteName = parse.quote(byteName)
        url = r'http://news.search.hexun.com/news?key=%s&t=all&s=0&f=0&page=%s'%(quoteName,i)
        try:
            get_info(url=url, name=name)
        except Exception as e:
            print(e)
            continue
    print('已经获得%s数据'%name)
    time.sleep(uniform(5, 10))

Reposted from: https://my.oschina.net/u/3673704/blog/1570220
