数据去重
数据去重的思路是:爬取一条新闻数据后先查询数据库,如果发现该标题已存在,那么就不存入数据库中去。
查询数据库代码如下:
# Query the rows already stored for this company so we can dedup by title.
sql_1 = 'SELECT * FROM test WHERE company = %s'  # select rows by company name
# DB-API 2.0 expects the parameters as a sequence, not a bare string.
cur.execute(sql_1, (company,))
data_all = cur.fetchall()  # every stored row for this company
# Column 1 of each row holds the news title (table layout: company, title, href, source, date).
title_all = [row[1] for row in data_all]
进行判断:
# Insert only when this title has not been stored yet (deduplication).
if title[i] not in title_all:
    sql_2 = 'INSERT INTO test(company, title, href, source, date) VALUES(%s, %s, %s, %s, %s)'
    # BUG FIX: the original passed `spurce[i]` — a typo for the `source` list.
    cur.execute(sql_2, (company, title[i], href[i], source[i], date[i]))
    db.commit()
如果不在数据库内就执行存入操作,将上述代码汇总为:
# Connect once, outside the loop. The original opened a brand-new MySQL
# connection on every iteration and only closed the last one — a connection leak.
db = pymysql.connect(host='localhost', port=3308, user='root', password='', database='pachong', charset='utf8')
cur = db.cursor()  # session cursor
for i in range(len(title)):
    # 1. Fetch the titles already stored for this company.
    sql_1 = 'SELECT * FROM test WHERE company = %s'
    cur.execute(sql_1, (company,))  # DB-API params go in a sequence
    data_all = cur.fetchall()
    title_all = [row[1] for row in data_all]  # column 1 is the news title
    # 2. Insert only if the title is not present yet (dedup).
    if title[i] not in title_all:
        sql_2 = 'INSERT INTO test(company, title, href, source, date) VALUES(%s, %s, %s, %s, %s)'
        # BUG FIX: `spurce[i]` was a typo for `source[i]`.
        cur.execute(sql_2, (company, title[i], href[i], source[i], date[i]))
        db.commit()
cur.close()
db.close()
数据清洗
# 1. strip() removes leading/trailing whitespace and newlines.
res = res.strip()
# 2. split() keeps only the part before the first space.
#    BUG FIX: the original called date.split('') — an empty separator
#    raises `ValueError: empty separator`; a single space was intended.
date = date.split(' ')[0]
# 3. re.sub() strips HTML tags such as <em>...</em> from the title.
title = '阿里巴巴<em>xxxx'
title = re.sub('<.*?>', '', title)  # replace anything of the form <...> with ''
日期格式统一
import re
import time
from datetime import datetime
def beforeHours2Date(hours, date_format='%Y-%m-%d %H:%M:%S'):
    """Return the local time `hours` hours ago as a formatted string.

    Args:
        hours: number of hours to subtract; accepts int, float, or a
            numeric string. (BUG FIX: the original used int(hours), which
            truncated fractional hours such as 0.5 down to 0.)
        date_format: strftime format for the result. (BUG FIX: the
            original accepted this parameter but ignored it, always
            formatting with '%Y-%m-%d %H:%M:%S'.)

    Returns:
        The formatted local-time string.
    """
    hours = float(hours)
    t = time.time() - hours * 60 * 60
    return time.strftime(date_format, time.localtime(t))
def parse_ymd(s):
    """Normalize a Chinese news date string to 'YYYY-MM-DD HH:MM[:SS]'.

    Absolute forms ('2022-3-15 10:30', '3月15日', '2021-3-15', '3-15') are
    parsed directly; relative forms ('3天前', '2小时前', '昨天', '刚刚', ...)
    are converted to a timestamp via beforeHours2Date().

    NOTE(review): month-day forms without a year assume 2022 — confirm
    whether the year should instead be derived from the current date.
    Returns the input unchanged when no known format matches (the original
    raised an UnboundLocalError in that case).
    """
    full = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", s)
    if full:
        return full[0]  # already a full timestamp; return the matched part
    if '月' in s:  # '3月15日' -> '3-15', then handled by the dash branches
        s = s.replace('月', '-').replace('日', '')
    dashes = s.count('-')
    if dashes == 2:  # 'YYYY-M-D'
        year_s, mon_s, day_s = s.split('-')
        return datetime(int(year_s), int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    if dashes == 1:  # 'M-D', year assumed
        mon_s, day_s = s.split('-')
        return datetime(2022, int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    # Relative expressions -> hours before now.
    if '前天' in s:
        return beforeHours2Date(48)  # BUG FIX: was 3 h; 前天 = two days ago
    if '天前' in s:
        days = re.findall('(.*?)天前', s)[0]
        return beforeHours2Date(float(days) * 24)  # BUG FIX: days were treated as hours
    if '周前' in s:
        return beforeHours2Date(7 * 24)  # BUG FIX: was 7 h; one week = 168 h
    if '小时前' in s:
        return beforeHours2Date(re.findall('(.*?)小时前', s)[0])
    if '昨天' in s:
        return beforeHours2Date(24)
    if '秒' in s or '刚刚' in s:
        return beforeHours2Date(0)
    if '分钟前' in s:
        return beforeHours2Date(0.5)
    return s  # unrecognized format: give it back untouched
# Clean every headline, normalize its date, and print the result.
for idx in range(len(title)):
    # Trim edge whitespace, then drop HTML tags such as <em>...</em>.
    cleaned = re.sub('<.*?>', '', title[idx].strip())
    title[idx] = cleaned
    date[idx] = parse_ymd(date[idx])  # unify the date format
    print(str(idx + 1) + '.' + cleaned + '(' + source[idx] + ' ' + date[idx] + ')')
    print(href[idx])
效果如下:
文本内容深度过滤——剔除噪声数据
1.根据新闻标题进行简单过滤
# 1. Simple filter on the headline: blank out every item whose title
#    does not mention the company.
for i in range(len(title)):
    if company not in title[i]:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
# Items are blanked first and removed afterwards: deleting while the
# for-loop runs would shift indices and raise `list index out of range`.
# BUG FIX: the original tested `'' in title` in all four loops, so the
# empty strings were only ever removed from `title`; each list must be
# scanned itself so the four lists stay aligned.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')
2.根据正文内容进行深度过滤
# 2. Deep filter using the article body: fetch each linked page and keep
#    the item if either the headline or the body mentions the company.
for i in range(len(title)):
    title[i] = title[i].strip()  # trim edge whitespace/newlines
    title[i] = re.sub('<.*?>', '', title[i])  # drop HTML tags such as <em>
    date[i] = parse_ymd(date[i])  # unify the date format
    try:
        article = requests.get(href[i], headers=headers, timeout=10).text  # fetch article body
    except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        article = '爬取失败'
    if company not in title[i] and company not in article:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
    print(str(i + 1) + '.' + title[i] + '(' + source[i] + ' ' + date[i] + ')')
    print(href[i])
# Blank first, remove afterwards, to avoid index errors inside the loop.
# BUG FIX: each while-loop must test its own list; the original tested
# `'' in title` four times, leaving empty strings in href/date/source.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')
注意:
若新闻中可能以公司简称(简写)出现、而我们也需要保留这类数据,那么可以把原先的 if company not in article 换成如下内容:
company_re = compay[0] + '.{0,5}' + company[-1]
if len(re.findall(company_re, article)) <1:
这里补充一个正则表达式,company[0]是公司名称第一个字,company[-1]表示公司名称最后一个字,.{0,5}中.表示任意一个字符,0,5表示0~5个字符(可进行修改),最后通过re.findall()寻找满足匹配规则的内容:若有匹配,列表长度大于等于1;若没有匹配,列表长度为0(小于1),此时就可执行将该新闻赋值为空值的操作。
总代码
import requests
import re
import time
from datetime import datetime
#--------------- Data scraping ----------------
company = '阿里巴巴'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd='+company # rtt=4 sorts by time; the default rtt=1 sorts by relevance
res = requests.get(url, headers=headers).text # headers make the request look like a normal browser
# print(res)
# NOTE(review): the class-name regexes below are tied to Baidu's markup at
# the time of writing — verify they still match the current page source.
p_href = '<h3 class="news-title_1YtI1 "><a href="(.*?)"'  # link of each result
href = re.findall(p_href, res, re.S)
# print(href)
p_title = '<h3 class="news-title_1YtI1 ">.*?aria-label="标题:(.*?)"'  # headline text
title = re.findall(p_title, res, re.S)
# print(title)
p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'  # publish date string
date = re.findall(p_date, res)
# print(date)
p_source = '<span class="c-color-gray".*?>(.*?)</span>'  # news source name
source = re.findall(p_source, res)
# print(source)
#---------------- Time helpers ----------------
def beforeHours2Date(hours, date_format='%Y-%m-%d %H:%M:%S'):
    """Return the local time `hours` hours ago as a formatted string.

    Args:
        hours: number of hours to subtract; accepts int, float, or a
            numeric string. (BUG FIX: the original used int(hours), which
            truncated fractional hours such as 0.5 down to 0.)
        date_format: strftime format for the result. (BUG FIX: the
            original accepted this parameter but ignored it, always
            formatting with '%Y-%m-%d %H:%M:%S'.)

    Returns:
        The formatted local-time string.
    """
    hours = float(hours)
    t = time.time() - hours * 60 * 60
    return time.strftime(date_format, time.localtime(t))
def parse_ymd(s):
    """Normalize a Chinese news date string to 'YYYY-MM-DD HH:MM[:SS]'.

    Absolute forms ('2022-3-15 10:30', '3月15日', '2021-3-15', '3-15') are
    parsed directly; relative forms ('3天前', '2小时前', '昨天', '刚刚', ...)
    are converted to a timestamp via beforeHours2Date().

    NOTE(review): month-day forms without a year assume 2022 — confirm
    whether the year should instead be derived from the current date.
    Returns the input unchanged when no known format matches (the original
    raised an UnboundLocalError in that case).
    """
    full = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", s)
    if full:
        return full[0]  # already a full timestamp; return the matched part
    if '月' in s:  # '3月15日' -> '3-15', then handled by the dash branches
        s = s.replace('月', '-').replace('日', '')
    dashes = s.count('-')
    if dashes == 2:  # 'YYYY-M-D'
        year_s, mon_s, day_s = s.split('-')
        return datetime(int(year_s), int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    if dashes == 1:  # 'M-D', year assumed
        mon_s, day_s = s.split('-')
        return datetime(2022, int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    # Relative expressions -> hours before now.
    if '前天' in s:
        return beforeHours2Date(48)  # BUG FIX: was 3 h; 前天 = two days ago
    if '天前' in s:
        days = re.findall('(.*?)天前', s)[0]
        return beforeHours2Date(float(days) * 24)  # BUG FIX: days were treated as hours
    if '周前' in s:
        return beforeHours2Date(7 * 24)  # BUG FIX: was 7 h; one week = 168 h
    if '小时前' in s:
        return beforeHours2Date(re.findall('(.*?)小时前', s)[0])
    if '昨天' in s:
        return beforeHours2Date(24)
    if '秒' in s or '刚刚' in s:
        return beforeHours2Date(0)
    if '分钟前' in s:
        return beforeHours2Date(0.5)
    return s  # unrecognized format: give it back untouched
#--------------- Main: clean, normalize, deep-filter, print ----------------
for i in range(len(title)):
    title[i] = title[i].strip()  # trim edge whitespace/newlines
    title[i] = re.sub('<.*?>', '', title[i])  # drop HTML tags such as <em>
    date[i] = parse_ymd(date[i])  # unify the date format
    try:
        article = requests.get(href[i], headers=headers, timeout=10).text  # fetch article body
    except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        article = '爬取失败'
    # Keep the item only if the headline or the body mentions the company.
    if company not in title[i] and company not in article:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
    print(str(i + 1) + '.' + title[i] + '(' + source[i] + ' ' + date[i] + ')')
    print(href[i])
#--------------- Drop the blanked-out items ----------------
# Elements are blanked inside the loop above and removed here, because
# removing during iteration would shift indices (`list index out of range`).
# BUG FIX: the original tested `'' in title` in all four loops, so empty
# strings were only ever removed from `title`; each list must be scanned
# itself so the four lists stay aligned.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')