Scraping stock news with Python

Reference links

CSDN

Learning materials

XPath - 菜鸟教程 (Runoob)
Regular expressions - 菜鸟教程 (Runoob)
Online regular expression tester - 菜鸟教程 (Runoob)
Matching Chinese characters with regular expressions
Python string decode() - 菜鸟教程 (Runoob)
Partial replacement with regular expressions

decode errors

  • Go to the target URL, right-click -> view page source, and find the page's encoding in the content attribute of the meta tag, e.g.
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
  • Pass errors='ignore' to decode so the occasional byte that cannot be decoded is skipped, e.g. (a fuller sketch follows after this list)
page_data = requests.get(url).content.decode("gbk", errors='ignore')
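
A minimal sketch of the same idea without hard-coding the charset (the URL here is only a placeholder; requests' apparent_encoding is a best-effort guess):

import requests

url = 'http://stock.jrj.com.cn/share,000001,ggxw.shtml'  # placeholder news-list URL
resp = requests.get(url)
# fall back to gbk if detection yields nothing; skip the occasional undecodable byte
encoding = resp.apparent_encoding or 'gbk'
page_data = resp.content.decode(encoding, errors='ignore')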

Code

# -*- coding: utf-8 -*-
"""
Created on Tue Sep  7 21:16:33 2021

@author: DELL
"""


'''
def pachong():
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    # table of all stocks in the sector
    table_url = 'http://summary.jrj.com.cn/hybk/400115934.shtml'
    page_data = requests.get(table_url).content.decode("gbk")
    data_tree = etree.HTML(page_data)
    # scrape the "stock code" column into codes_list
    print(data_tree)
    if page_data.find("jrj-topDiv highlightedCol") != -1:
        codes_list = data_tree.xpath("//*[@class=\"jrj-topDiv\"]/a/@href")
        print(codes_list)
    # for every code in codes_list, crawl the corresponding news_url
    #news_url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
    
    # read the current system date and build date_range = [today - 1 year, today]
    # in the newlist of the news_url page, crawl the <span> and <i> inside every <li>
    # the href inside <span> is the URL of the article body, i.e. news_content_url
    # the text of <i> is the date, which must fall inside date_range
    
    # for every news_content_url, crawl title, date, content and origin into a dict dic
    # write dic into the database
    
#pachong()
'''

import requests
import json
import pandas as pd
import time
import re
import datetime
from dateutil.relativedelta import relativedelta
from lxml import etree
import openpyxl

def spider():
    '''
    Main spider function
    '''
    #'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    headers = {
            'User-Agent': 'Mozilla/5.0'
    }
    
    # p=1050 means page 1 with 50 rows per page
    url = 'http://q.jrjimg.cn/?q=cn|s|bk400115934&c=m&n=hqa&o=pl,d&p=1050&_dc=1631090941576'
    #'http://q.jrjimg.cn/?q=cn|s|bk400115934&c=m&n=hqa&o=pl,d&p=2050&_dc=1631091010650'
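    # note: this endpoint returns a JavaScript snippet 'var hqa={...};' rather than plain JSON,
    # so text_2_dict() below strips the prefix and quotes the keys before calling json.loads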


    r = requests.get(url, headers=headers)  # fetch the data
    text = r.text  # get the response body as text
    data = text_2_dict(text)
    codes_list, stock_names = dict_2_codes_list(data)
    code_news_dic = capture_page(codes_list, stock_names)
    '''
    code_news_dic={
    stock_code:{
    'content':[[news_date, news_url],...],
    'name':stock_name,
    'news_dics':[{'news_title':news_title, 'news_time':news_time, 'news_origin':news_origin, 'news_content':news_content},...]
    },
    ...
    }
    '''
    print('---------result---------')
    #print(code_news_dic)
    #capture_news(url)
    #{'news_title':news_title, 'news_time':news_time, 'news_origin':news_origin, 'news_content':news_content}
    n = len(code_news_dic.items())
    count = 0
    for k, v in code_news_dic.items():
        count += 1
        if count % max(1, n // 10) == 0:  # crude progress indicator
            print('#',end='')
        stock_code = k
        stock_name = v['name']
        stock_date_urls = v['content']
        v['news_dics'] = []
        for news_date, news_url in stock_date_urls:
            one_news_dic = capture_news(news_url)
            v['news_dics'].append(one_news_dic)
    print()
    #print(code_news_dic['000593']['news_dics'])
    
def text_2_dict(text):
    text = text.replace('\n','')
    text = text.replace('var hqa=','')
    text = text.replace(';','')
    pattern = r'(\w+)(:)'
    text = re.sub(pattern, lambda m:'\"' + m.group(1) + '\"' + m.group(2),text)
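    # illustrative: the substitution turns the unquoted JavaScript keys into JSON keys,
    # e.g. 'HqData:[[' becomes '"HqData":[[' so json.loads can parse the string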
    #print(text)
    data = json.loads(text)  # parse the cleaned string as JSON
    return data

def dict_2_codes_list(data):
    #print('json content',type(data))
    #print(data,type(data))
    lists = data['HqData']
    codes_list = []
    stock_names = []
    for l in lists:
        codes_list.append(l[1])
        stock_names.append(l[2])
    #print('codes_list')
    #print(codes_list)
    return codes_list, stock_names

def my_zip(a,b):
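    # equivalent to [list(pair) for pair in zip(a, b)], kept as an explicit loop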
    r = []
    for i in range(len(a)):
        r.append([a[i],b[i]])
    return r

def page_2_list(Begin_date, page_data, data_tree):
    news_date_href_li = []
    Min_date = datetime.datetime.now()  # keep as a datetime so it can be compared with Begin_date
    if page_data.find("newlist") != -1:
        news_title_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/span/a/text()")))
        #print(news_title_li)
        news_href_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/span/a/@href")))
        news_date_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/i/text()")))
        news_date_href_li = my_zip(news_date_li, news_href_li)
        #print(news_date_href_li[0],type(news_date_href_li[0][1]))
        #'/html/body/div[6]/div[2]/div[2]/table/tbody/tr/td[2]/ul/li[2]/span/a'
        
        # filter out useless news items and ones that are too old
        pattern = '【龙虎榜】'
        Min_date = my_filter(pattern, Begin_date, news_title_li, news_date_href_li)
        
        #print('len(news_date_href_li) = {}'.format(len(news_date_href_li)))
        #print(news_date_href_li)
    return news_date_href_li, Min_date
        
def getDateBegin():
    date_now = datetime.datetime.now()
    #earlist_date = (date_now - relativedelta(years=1)).strftime('%Y-%m-%d')
    earlist_date = (date_now - relativedelta(years=1))
    return earlist_date

def my_filter(pattern, Begin_date, news_title_li, news_date_href_li):
    Min_date = datetime.datetime.now()
    # first drop the useless items whose title contains 【龙虎榜】
    idx = 0
    n = len(news_title_li)
    #print(len(news_title_li),len(news_date_href_li))
    while idx < n:
        title = news_title_li[idx]
        if title.find(pattern) != -1:
            #print(idx, title)
            # deleting an item shifts the rest down, so do not advance idx here
            del news_date_href_li[idx]
            del news_title_li[idx]
            
            n -= 1
        else:
            idx += 1
    # then normalize the date strings
    for idx, item in enumerate(news_date_href_li):
        t = item[0]
        #print(t)
        detester = t.split(' ')[0]
        date_ = datetime.datetime.strptime(detester,'%Y-%m-%d')
        
        if date_ < Min_date:
            Min_date = date_
        
        item[0] = date_.strftime('%Y-%m-%d')
        #print(item[0])
    # finally drop records that fall outside the time range
    filter_old(Begin_date, news_date_href_li)
    return Min_date

def filter_old(Begin_date, news_date_href_li):
    idx = 0
    n = len(news_date_href_li)
    while idx < n:
        item = news_date_href_li[idx]
        date_ = item[0]
        #print(date_, Begin_date)
        if datetime.datetime.strptime(date_, '%Y-%m-%d') < Begin_date:
            #print('deleting {}'.format(news_date_href_li[idx]))
            del news_date_href_li[idx]
            n -= 1
        else:
            idx += 1
        

def capture_page(codes_list, stock_names):
    codes_names_list = my_zip(codes_list, stock_names)
    code_news_dic = {}
    for code, name in codes_names_list:
        # 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml' is page 1
        # 'http://stock.jrj.com.cn/share,' + code + ',ggxw_2.shtml' is page 2
        url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
        print('Start crawling: {}'.format(url))
        page_data = requests.get(url).content.decode("gbk")
        data_tree = etree.HTML(page_data)
        Begin_date = getDateBegin()
        code_news_dic[code] = {} #{{}}
        # dic[key]->value={}
        # value[key]->value_value=[[date,href],[date,href],...]
        # dic ={}
        # dic['zf'] = 2 inserts the key 'zf' into dic and maps it to the value 2
        
        code_news_dic[code]['content'], Min_date = page_2_list(Begin_date, page_data, data_tree)
        code_news_dic[code]['name'] = name
        
        if Min_date < Begin_date:
            #print('Begin_date = {}, Min_date = {}'.format(Begin_date, Min_date))
            print('Min_date < Begin_date, stop crawling sub-pages')
            # stop crawling this stock
            continue
        else:
            # keep crawling subsequent pages
            page_idx = 2
            while Min_date >= Begin_date:
                #print('Begin_date = {}, Min_date = {}'.format(Begin_date, Min_date))
                url_sub = 'http://stock.jrj.com.cn/share,' + code + ',ggxw_' + str(page_idx) + '.shtml'
                print('\tStart crawling: {}'.format(url_sub))
                page_data = requests.get(url_sub).content.decode("gbk")
                data_tree = etree.HTML(page_data)
                code_news_dic_append, Min_date = page_2_list(Begin_date, page_data, data_tree)
                #print('len(code_news_dic_append) = {}'.format(len(code_news_dic_append)))
                if len(code_news_dic_append) == 0:
                    break
                code_news_dic[code]['content'] += code_news_dic_append
                page_idx += 1
            
        #print(len(news_date_href_li))
    return code_news_dic

def clean_for_news_title_or_time(s_list):
    for idx, s in enumerate(s_list):
        # strip carriage returns, newlines and the full-width indentation space
        s = s.replace('\r', '')
        s = s.replace('\n', '')
        s_list[idx] = s.replace('\u3000', '')
    return ''.join(s_list)

def clean_for_news_origin(s_list):
    for idx, s in enumerate(s_list):
        # strip carriage returns, newlines and the full-width indentation space
        s = s.replace('\r', '')
        s = s.replace('\n', '')
        s = s.replace('\u3000', '')
        # strip the leading '【来源:' ("source:") and the trailing '】'
        s = re.sub('\【[\u4e00-\u9fa5]+\:', '', s)
        s_list[idx] = re.sub('\】', '', s)
    return ''.join(s_list)

def clean_for_news_content(s_list):
    for idx, s in enumerate(s_list):
        #print(s,re.search('\{\w+\:', s))
        # drop CSS/style text (the //text() XPath also picks up inline <style> content such as '{color: ...')
        # re.match returns non-None only if the match starts at the very beginning of the string
        # re.search finds a match anywhere in the string
        if re.search(r'\{\w+:', s) is not None:
            s_list[idx] = ''
    #s = re.sub('\{\w+\:', lambda m:'\"' + m.group(1) + '\"' + m.group(2), s)
    return ''.join(s_list)
    
def capture_news(url):
    #print('get_content:')
    #print(requests.get(url).content)
    # crawl the content of a single news article page
    # <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    page_data = requests.get(url).content.decode("gbk", errors='ignore')  # gb2312: right-click the page, view source, and check the meta tag
    data_tree = etree.HTML(page_data)
    result = dict({'news_title': 'Expired news, the original page is no longer available!', 'news_time': '', 'news_origin': '', 'news_content': ''})
    if page_data.find("titmain") != -1:
        news_title = list(map(str, data_tree.xpath("//div[@class='titmain']/h1/text()")))
        news_time = list(map(str, data_tree.xpath("//div[@class='titmain']/p[@class='inftop']/span[1]/text()")))
        news_origin = list(map(str, data_tree.xpath("//div[@class='titInf210118']/p/i[1]/text()")))
        news_content = list(map(str, data_tree.xpath("//div[@class='texttit_m1']//text()")))
        
        #print('''
        #      raw
        #      news_title: {}
        #      news_time: {}
        #      news_origin: {}
        #     news_content: {}
        #'''.format(news_title, news_time, news_origin, news_content))
        
        news_title = clean_for_news_title_or_time(news_title)
        news_time = clean_for_news_title_or_time(news_time)
        news_origin = clean_for_news_origin(news_origin)
        news_content = clean_for_news_content(news_content)
        
        #print('''
        #      after cleaning
        #      news_title: {}
        #      news_time: {}
        #      news_origin: {}
        #      news_content: {}
        #'''.format(news_title, news_time, news_origin, news_content))
        result = dict({'news_title': news_title, 'news_time': news_time, 'news_origin': news_origin, 'news_content': news_content})
    return result
spider()
#url = r'http://stock.jrj.com.cn/2021/06/15142232934084.shtml'
#capture_news(url)

Writing to the database

import jieba
import pymysql as mysql
import collections
import re


def sql_connect(): 
    mydb = mysql.connect(
        host = "***.**.**.*",  # 数据库主机地址
        port = 3306,  # 端口号
        user = "root",  # 数据库用户名
        passwd = "*******",  # 数据库密码
        database = "mydb_hhh"  # 选择一个数据库
    )
    return mydb

def select_data(mycursor):
    ################# query the news records #############
    sql = 'select id, news_content from news_list;'
    #sql = "select news_content from news where code='000937' ;"
    mycursor.execute(sql)
    res = mycursor.fetchall()
    print("----------查询记录成功!----------")
    mydb.commit()
    return res

# remove stopwords
def out_stopword(seg):
    # list that will collect the kept words
    wordlist = []
    # load the stopword list
    stopword = [line.strip() for line in open(r'C:\Users\DELL\Desktop\pp分词\hit_stopwords.txt',encoding='UTF-8').readlines()]
    stopword += ' '
    #print(stopword)
    # iterate over the segmented words
    for word in seg:
        # drop stopwords and the special symbols listed below
        if word not in stopword and word not in ['&','\t','|','╱','/','-','―','–','#-','○','≤','±','㎡','"','®','\xa0','{','}','[',']','(',')','●']:
            wordlist.append(word)
    print("------------去停用词成功!------------")
    #print(wordlist)
    return wordlist

def change_str(string):
    pattern = '(\'[\u4e00-\u9fa5]+)(\')'  # a quoted Chinese word, e.g. '中文'
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    pattern = '(\'\w+)(\')'  # a quoted English word
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    #pattern = '(\'[0-9]+)(\')'
    #string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    pattern = '(\'\w*[0-9]*\.*[0-9]*\%*)(\')'  # a quoted number / percentage
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    
    return string
    

def insert_frequency(mycursor, fre, idx):
    dic_str = str(fre)
    dic_str = change_str(dic_str)
    dic_str = '\'' + dic_str +'\''
    print('writing row with id={}'.format(idx))
    #print(dic_str)
    sql = 'update news set word_frequency={} where id={};'.format(dic_str, idx)
    #if flag == 1:
    #print('sql: {}'.format(sql))
    mycursor.execute(sql)
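    # note: an alternative to str.format (and to the manual quote escaping in change_str)
    # would be a parameterized query, e.g.
    #   mycursor.execute('update news set word_frequency=%s where id=%s', (str(fre), idx))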
    
    print('update mysql successfully!')
     
    

def get_frequency(mydb, data):
    # segment each news_content with jieba; data is a sequence of (id, news_content) rows
    #flag=0
    mycursor = mydb.cursor()
    n = len(data)
    
    fre_all = {}
    
    for i in range(n):#range(n)
        idx1 = data[i][0]
        if data[i][1].find('fs') != -1 and data[i][1].find('var') != -1:
            continue
        r = list(jieba.cut(data[i][1]))
        wordlist = out_stopword(r)
        word_counts = dict(collections.Counter(wordlist))
        word_counts = dict(sorted(word_counts.items(), key = lambda x:x[1], reverse = True))
        #if '矿种' in word_counts.keys():
        #    print(word_counts)
        #    flag=1
        #print("分词结果:{}".format(word_counts))
        #print('str(dict)={}'.format(str(word_counts)))
        # 写入数据库
        #print(word_counts)
        #print('当前处理id={}的行'.format(idx1))
        insert_frequency(mycursor, word_counts, idx1)
        #if flag == 1:
        #    insert_frequency(mycursor, word_counts, idx1, flag)
        #    flag=0
        #else:
        #    insert_frequency(mycursor, word_counts, idx1, flag)
        #if i % 10 == 0 or i == n-1:
        merge(word_counts,fre_all)
    mydb.commit()
    mycursor.close()
    
        #mycursor = mydb.cursor()
        
        
    # start from an empty dict
    # iterate over the word list: if a word is not yet a key, create the key-value pair; otherwise do dict[key] += 1
    return fre_all

def merge(b,c):
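    # merge the word counts of b into c in place, like updating a collections.Counter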
    for k_b,v_b in b.items():
        if k_b in c.keys():
            c[k_b] += v_b
        else:
            c[k_b] = v_b
    #return c

#def writeExcel(wPath):
def insert_frequency_all(mydb, fre, idx):
    mycursor = mydb.cursor()
    dic_str = str(fre)
    dic_str = change_str(dic_str)
    dic_str = '\'' + dic_str +'\''
    print('writing row with id={}'.format(idx))
    #print(dic_str)
    sql = 'update fre_all_table set fre_all={} where id={};'.format(dic_str, idx)
    #if flag == 1:
    #print('sql: {}'.format(sql))
    mycursor.execute(sql)
    mydb.commit()
    mycursor.close()
    print('update mysql successfully!')

mydb = sql_connect()
mycursor = mydb.cursor()
data = select_data(mycursor)
mycursor.close()

fre_all = get_frequency(mydb, data)
fre_all = dict(sorted(fre_all.items(), key = lambda x:x[1], reverse = True))

insert_frequency_all(mydb, fre_all, 1)
mydb.close()

print(fre_all)

