Reference Links
Learning materials
XPath - 菜鸟教程
Regular expressions - 菜鸟教程
Online regex tester - 菜鸟教程
Matching Chinese characters with regular expressions
Python string decode() explained - 菜鸟教程
Partial replacement with regular expressions
decode errors
- Open the target URL, right-click -> View page source, and find the page's encoding in the content attribute of the <meta> tag, e.g.
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
- Call decode with errors='ignore' to skip the occasional undecodable byte, e.g.
page_data = requests.get(url).content.decode("gbk", errors='ignore')
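If you would rather not hard-code the charset, it can be sniffed from the raw bytes first. A minimal sketch (the helper name, regex, and utf-8 fallback are my own, not part of the crawler below):
import re
import requests
def detect_and_decode(url):
    # Fetch raw bytes and look for charset=... in the raw HTML (usually in the <meta> tag)
    raw = requests.get(url).content
    m = re.search(rb'charset=["\']?([\w-]+)', raw)
    charset = m.group(1).decode('ascii') if m else 'utf-8'
    # gbk is a superset of gb2312, so decoding gb2312 pages as gbk is safe
    if charset.lower() == 'gb2312':
        charset = 'gbk'
    return raw.decode(charset, errors='ignore')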
Code
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 7 21:16:33 2021
@author: DELL
"""
'''
First draft, kept for reference:
def pachong():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    # Table listing all stocks in the sector
    table_url = 'http://summary.jrj.com.cn/hybk/400115934.shtml'
    page_data = requests.get(table_url).content.decode("gbk")
    data_tree = etree.HTML(page_data)
    # Scrape the "stock code" column into a codes_list
    print(data_tree)
    if page_data.find("jrj-topDiv highlightedCol") != -1:
        codes_list = data_tree.xpath("//*[@class=\"jrj-topDiv\"]/a/@href")
        print(codes_list)
    # For each code in codes_list, build the matching news_url
    #news_url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
    # Read the current system date; date_range = [today - 1 year, today]
    # In the news list of news_url, read each <li>'s <span> and <i>
    # The href in <span> is the news page URL, i.e. news_content_url
    # The text of <i> is the date, which must fall inside date_range
    # For each news_content_url, scrape title, date, content, origin into a dict dic
    # Write dic to the database
#pachong()
'''
import requests
import json
import pandas as pd
import time
import re
import datetime
from dateutil.relativedelta import relativedelta
from lxml import etree
import openpyxl
def spider():
    '''
    Main crawler entry point.
    '''
    # Full UA string if a more realistic header is needed:
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    # p=1050 means page 1 with 50 rows per page; p=2050 (commented URL below) would be page 2
    url = 'http://q.jrjimg.cn/?q=cn|s|bk400115934&c=m&n=hqa&o=pl,d&p=1050&_dc=1631090941576'
    #'http://q.jrjimg.cn/?q=cn|s|bk400115934&c=m&n=hqa&o=pl,d&p=2050&_dc=1631091010650'
    r = requests.get(url, headers=headers)  # fetch the sector's stock table
    text = r.text
    data = text_2_dict(text)
    codes_list, stock_names = dict_2_codes_list(data)
    code_news_dic = capture_page(codes_list, stock_names)
    '''
    code_news_dic = {
        stock_code: {
            'content': [[news_date, news_url], ...],
            'name': stock_name,
            'news_dics': [{'news_title': news_title, 'news_time': news_time, 'news_origin': news_origin, 'news_content': news_content}, ...]
        },
        ...
    }
    '''
    print('---------result---------')
    #print(code_news_dic)
    # Crawl every news page, attach the parsed dicts, and print '#' as a crude progress bar
    n = len(code_news_dic)
    count = 0
    step = max(n // 10, 1)  # guard against n < 10, which would make n//10 zero
    for k, v in code_news_dic.items():
        count += 1
        if count % step == 0:
            print('#', end='')
        stock_code = k
        stock_name = v['name']
        stock_date_urls = v['content']
        v['news_dics'] = []
        for news_date, news_url in stock_date_urls:
            one_news_dic = capture_news(news_url)
            v['news_dics'].append(one_news_dic)
    print()
    #print(code_news_dic['000593']['news_dics'])
def text_2_dict(text):
    # The endpoint returns a JS assignment ('var hqa={...};'); strip the wrapper
    text = text.replace('\n', '')
    text = text.replace('var hqa=', '')
    text = text.replace(';', '')
    # Quote the bare object keys so the remainder is valid JSON
    pattern = r'(\w+)(:)'
    text = re.sub(pattern, lambda m: '"' + m.group(1) + '"' + m.group(2), text)
    data = json.loads(text)  # str -> dict
    return data
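# A sketch of the transformation, on a made-up payload of the same shape:
#   in : 'var hqa={Summary:{p:1},HqData:[[1,"000001","XX"]]};'
#   out: {'Summary': {'p': 1}, 'HqData': [[1, '000001', 'XX']]}
# The regex only works because the payload's keys are bare and its string
# values contain no 'word:' sequences.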
def dict_2_codes_list(data):
    # Each HqData row is a list: index 1 holds the stock code, index 2 the stock name
    lists = data['HqData']
    codes_list = []
    stock_names = []
    for l in lists:
        codes_list.append(l[1])
        stock_names.append(l[2])
    return codes_list, stock_names
def my_zip(a, b):
    # Like zip(), but yields mutable [a, b] lists so callers can rewrite items in place
    r = []
    for i in range(len(a)):
        r.append([a[i], b[i]])
    return r
def page_2_list(Begin_date, page_data, data_tree):
    news_date_href_li = []
    # Default to "now" as a datetime so the caller's date comparison still works
    # when the page has no news list
    Min_date = datetime.datetime.now()
    if page_data.find("newlist") != -1:
        news_title_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/span/a/text()")))
        news_href_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/span/a/@href")))
        news_date_li = list(map(str, data_tree.xpath("//ul[@class='newlist']/li/i/text()")))
        news_date_href_li = my_zip(news_date_li, news_href_li)
        # Full example path: '/html/body/div[6]/div[2]/div[2]/table/tbody/tr/td[2]/ul/li[2]/span/a'
        # Drop useless news items and anything older than Begin_date
        pattern = '【龙虎榜】'
        Min_date = my_filter(pattern, Begin_date, news_title_li, news_date_href_li)
    return news_date_href_li, Min_date
def getDateBegin():
    # Earliest date we keep: one year before today
    date_now = datetime.datetime.now()
    earliest_date = date_now - relativedelta(years=1)
    return earliest_date
def my_filter(pattern, Begin_date, news_title_li, news_date_href_li):
    Min_date = datetime.datetime.now()
    # First drop the useless items whose title contains the pattern (【龙虎榜】)
    idx = 0
    n = len(news_title_li)
    while idx < n:
        title = news_title_li[idx]
        if title.find(pattern) != -1:
            # Deleting shifts every later index down, so only advance idx when nothing was removed
            del news_date_href_li[idx]
            del news_title_li[idx]
            n -= 1
        else:
            idx += 1
    # Then normalize the dates to YYYY-MM-DD and track the oldest one seen
    for idx, item in enumerate(news_date_href_li):
        t = item[0]
        date_str = t.split(' ')[0]
        date_ = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        if date_ < Min_date:
            Min_date = date_
        item[0] = date_.strftime('%Y-%m-%d')
    # Finally drop the records that fall outside the date range
    filter_old(Begin_date, news_date_href_li)
    return Min_date
def filter_old(Begin_date, news_date_href_li):
    # In-place removal of records older than Begin_date (same delete-while-scanning pattern as above)
    idx = 0
    n = len(news_date_href_li)
    while idx < n:
        item = news_date_href_li[idx]
        date_ = item[0]
        if datetime.datetime.strptime(date_, '%Y-%m-%d') < Begin_date:
            #print('removing {}'.format(news_date_href_li[idx]))
            del news_date_href_li[idx]
            n -= 1
        else:
            idx += 1
def capture_page(codes_list, stock_names):
    codes_names_list = my_zip(codes_list, stock_names)
    code_news_dic = {}
    for code, name in codes_names_list:
        # 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml' is page 1;
        # 'http://stock.jrj.com.cn/share,' + code + ',ggxw_2.shtml' is page 2, and so on
        url = 'http://stock.jrj.com.cn/share,' + code + ',ggxw.shtml'
        print('Crawling: {}'.format(url))
        page_data = requests.get(url).content.decode("gbk")
        data_tree = etree.HTML(page_data)
        Begin_date = getDateBegin()
        code_news_dic[code] = {}  # one nested dict per stock code
        code_news_dic[code]['content'], Min_date = page_2_list(Begin_date, page_data, data_tree)
        code_news_dic[code]['name'] = name
        if Min_date < Begin_date:
            # The first page already reaches past the date range: no sub-pages needed
            print('Min_date < Begin_date, skipping sub-pages')
            continue
        else:
            # Keep crawling sub-pages until we pass the date range or run out of news
            page_idx = 2
            while Min_date >= Begin_date:
                url_sub = 'http://stock.jrj.com.cn/share,' + code + ',ggxw_' + str(page_idx) + '.shtml'
                print('\tCrawling: {}'.format(url_sub))
                page_data = requests.get(url_sub).content.decode("gbk")
                data_tree = etree.HTML(page_data)
                code_news_dic_append, Min_date = page_2_list(Begin_date, page_data, data_tree)
                if len(code_news_dic_append) == 0:
                    break
                code_news_dic[code]['content'] += code_news_dic_append
                page_idx += 1
    return code_news_dic
def clean_for_news_title_or_time(s_list):
    for idx, s in enumerate(s_list):
        # Strip carriage returns, newlines, and full-width spaces (first-line indents)
        s = s.replace('\r', '')
        s = s.replace('\n', '')
        s_list[idx] = s.replace('\u3000', '')
    return ''.join(s_list)
def clean_for_news_origin(s_list):
    for idx, s in enumerate(s_list):
        # Strip carriage returns, newlines, and full-width spaces (first-line indents)
        s = s.replace('\r', '')
        s = s.replace('\n', '')
        s = s.replace('\u3000', '')
        # Strip the '【来源:' prefix and the trailing '】'
        s = re.sub(r'【[\u4e00-\u9fa5]+:', '', s)
        s_list[idx] = re.sub(r'】', '', s)
    return ''.join(s_list)
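# For example, a raw origin string like '【来源:某报】' comes out as '某报'
# (sample text made up; the real pages wrap the origin the same way).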
def clean_for_news_content(s_list):
    for idx, s in enumerate(s_list):
        # Drop inline style/script fragments, recognizable by '{word:' sequences.
        # (re.match only matches at position 0; re.search matches anywhere in the string.)
        if re.search(r'\{\w+:', s) is not None:
            s_list[idx] = ''
    return ''.join(s_list)
def capture_news(url):
    # Crawl one news page; its charset comes from the page's meta tag:
    # <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    page_data = requests.get(url).content.decode("gbk", errors='ignore')
    data_tree = etree.HTML(page_data)
    result = {'news_title': 'Expired news, original page no longer available!', 'news_time': '', 'news_origin': '', 'news_content': ''}
    if page_data.find("titmain") != -1:
        news_title = list(map(str, data_tree.xpath("//div[@class='titmain']/h1/text()")))
        news_time = list(map(str, data_tree.xpath("//div[@class='titmain']/p[@class='inftop']/span[1]/text()")))
        news_origin = list(map(str, data_tree.xpath("//div[@class='titInf210118']/p/i[1]/text()")))
        news_content = list(map(str, data_tree.xpath("//div[@class='texttit_m1']//text()")))
        #print('raw:', news_title, news_time, news_origin, news_content)
        news_title = clean_for_news_title_or_time(news_title)
        news_time = clean_for_news_title_or_time(news_time)
        news_origin = clean_for_news_origin(news_origin)
        news_content = clean_for_news_content(news_content)
        #print('cleaned:', news_title, news_time, news_origin, news_content)
        result = {'news_title': news_title, 'news_time': news_time, 'news_origin': news_origin, 'news_content': news_content}
    return result
spider()
#url = r'http://stock.jrj.com.cn/2021/06/15142232934084.shtml'
#capture_news(url)
Writing to the Database
import jieba
import pymysql as mysql
import collections
import re
def sql_connect():
    mydb = mysql.connect(
        host = "***.**.**.*",   # database host (masked)
        port = 3306,            # port
        user = "root",          # database user
        passwd = "*******",     # database password (masked)
        database = "mydb_hhh"   # database to use
    )
    return mydb
def select_data(mycursor):
    # Fetch the id and content of every news row (a SELECT needs no commit)
    sql = 'select id, news_content from news_list;'
    #sql = "select news_content from news where code='000937';"
    mycursor.execute(sql)
    res = mycursor.fetchall()
    print("----------Query succeeded!----------")
    return res
# Remove stopwords
def out_stopword(seg):
    wordlist = []
    # Load the stopword list (HIT stopwords file)
    stopword = [line.strip() for line in open(r'C:\Users\DELL\Desktop\pp分词\hit_stopwords.txt', encoding='UTF-8').readlines()]
    stopword.append(' ')  # treat a bare space as a stopword too
    # Walk the segmented words
    for word in seg:
        # Drop stopwords and leftover punctuation/symbols
        if word not in stopword and word not in ['&','\t','|','╱','/','-','―','–','#-','○','≤','±','㎡','"','®','\xa0','{','}','[',']','(',')','●']:
            wordlist.append(word)
    print("------------Stopwords removed!------------")
    return wordlist
def change_str(string):
    # str(dict) produces single-quoted keys; escape those quotes so the whole
    # string can sit inside a quoted SQL literal: 'key' -> \'key\'
    pattern = r"('[\u4e00-\u9fa5]+)(')"       # Chinese tokens
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    pattern = r"('\w+)(')"                    # English tokens
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    pattern = r"('\w*[0-9]*\.*[0-9]*%*)(')"   # numeric tokens like '3.5%'
    string = re.sub(pattern, lambda m: '\\' + m.group(1) + '\\' + m.group(2), string)
    return string
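# Sketch of the effect on a made-up str(dict): "{'涨停': 3, 'abc': 1}"
# becomes "{\'涨停\': 3, \'abc\': 1}", which survives the quoting in the
# UPDATE statements below. Parameterized queries (mycursor.execute(sql, args))
# would make this manual escaping unnecessary.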
def insert_frequency(mycursor, fre, idx):
    # Store the frequency dict as an escaped, quoted string literal
    dic_str = str(fre)
    dic_str = change_str(dic_str)
    dic_str = '\'' + dic_str + '\''
    print('Writing row id={}'.format(idx))
    sql = 'update news set word_frequency={} where id={};'.format(dic_str, idx)
    mycursor.execute(sql)
    print('update mysql successfully!')
def get_frequency(mydb, data):
    # Segment each news content with jieba and count word frequencies
    mycursor = mydb.cursor()
    n = len(data)
    fre_all = {}
    for i in range(n):
        idx1 = data[i][0]
        # Skip rows whose content is leftover JS (contains both 'fs' and 'var')
        if data[i][1].find('fs') != -1 and data[i][1].find('var') != -1:
            continue
        r = list(jieba.cut(data[i][1]))
        wordlist = out_stopword(r)
        word_counts = dict(collections.Counter(wordlist))
        word_counts = dict(sorted(word_counts.items(), key=lambda x: x[1], reverse=True))
        # Write this row's frequencies back to the database
        insert_frequency(mycursor, word_counts, idx1)
        # Accumulate this row's counts into the corpus-wide frequency table
        merge(word_counts, fre_all)
    mydb.commit()
    mycursor.close()
    return fre_all
def merge(b, c):
    # Fold the counts of b into c, in place
    for k_b, v_b in b.items():
        if k_b in c.keys():
            c[k_b] += v_b
        else:
            c[k_b] = v_b
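# A quick sanity check of merge's in-place accumulation:
#   c = {'a': 1}; merge({'a': 2, 'b': 1}, c)  ->  c == {'a': 3, 'b': 1}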
#def writeExcel(wPath):
def insert_frequency_all(mydb, fre, idx):
    # Store the merged frequency dict for the whole corpus in its own table
    mycursor = mydb.cursor()
    dic_str = str(fre)
    dic_str = change_str(dic_str)
    dic_str = '\'' + dic_str + '\''
    print('Writing row id={}'.format(idx))
    sql = 'update fre_all_table set fre_all={} where id={};'.format(dic_str, idx)
    mycursor.execute(sql)
    mydb.commit()
    mycursor.close()
    print('update mysql successfully!')
# Driver: query the news, compute per-row and corpus-wide frequencies, store both
mydb = sql_connect()
mycursor = mydb.cursor()
data = select_data(mycursor)
mycursor.close()
fre_all = get_frequency(mydb, data)
fre_all = dict(sorted(fre_all.items(), key=lambda x: x[1], reverse=True))
insert_frequency_all(mydb, fre_all, 1)
mydb.close()
print(fre_all)
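Since the frequencies are stored as str(dict) literals, reading them back means parsing that literal again. A minimal sketch, assuming the table and sql_connect settings above, and that MySQL consumed the escaping added by change_str when the UPDATE ran:
import ast
mydb = sql_connect()
cur = mydb.cursor()
cur.execute('select word_frequency from news where id=%s;', (1,))
row = cur.fetchone()
if row and row[0]:
    freq = ast.literal_eval(row[0])  # str(dict) -> dict
    print(sorted(freq.items(), key=lambda x: x[1], reverse=True)[:10])  # top 10 words
cur.close()
mydb.close()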