数据去重
数据去重的思路是:爬取一条新闻数据后先查询数据库,如果发现该标题已存在,那么就不存入数据库中去。
查询数据库代码如下:
# Query the rows already stored for this company so we can dedup by title.
sql_1 = 'SELECT * FROM test WHERE company = %s'  # select rows by company name
# DB-API 2.0 expects the parameters as a sequence, not a bare string.
cur.execute(sql_1, (company,))
data_all = cur.fetchall()  # every stored row for this company
# Column 1 of each row holds the news title (table layout: company, title, href, source, date).
title_all = [row[1] for row in data_all]
进行判断:
# Insert only when this title has not been stored yet (deduplication).
if title[i] not in title_all:
    sql_2 = 'INSERT INTO test(company, title, href, source, date) VALUES(%s, %s, %s, %s, %s)'
    # BUG FIX: the original passed `spurce[i]` — a typo for the `source` list.
    cur.execute(sql_2, (company, title[i], href[i], source[i], date[i]))
    db.commit()
如果不在数据库内就执行存入操作,将上述代码汇总为:
# Connect once, outside the loop. The original opened a brand-new MySQL
# connection on every iteration and only closed the last one — a connection leak.
db = pymysql.connect(host='localhost', port=3308, user='root', password='', database='pachong', charset='utf8')
cur = db.cursor()  # session cursor
for i in range(len(title)):
    # 1. Fetch the titles already stored for this company.
    sql_1 = 'SELECT * FROM test WHERE company = %s'
    cur.execute(sql_1, (company,))  # DB-API params go in a sequence
    data_all = cur.fetchall()
    title_all = [row[1] for row in data_all]  # column 1 is the news title
    # 2. Insert only if the title is not present yet (dedup).
    if title[i] not in title_all:
        sql_2 = 'INSERT INTO test(company, title, href, source, date) VALUES(%s, %s, %s, %s, %s)'
        # BUG FIX: `spurce[i]` was a typo for `source[i]`.
        cur.execute(sql_2, (company, title[i], href[i], source[i], date[i]))
        db.commit()
cur.close()
db.close()
数据清洗
# 1. strip() removes leading/trailing whitespace and newlines.
res = res.strip()
# 2. split() keeps only the part before the first space.
#    BUG FIX: the original called date.split('') — an empty separator
#    raises `ValueError: empty separator`; a single space was intended.
date = date.split(' ')[0]
# 3. re.sub() strips HTML tags such as <em>...</em> from the title.
title = '阿里巴巴<em>xxxx'
title = re.sub('<.*?>', '', title)  # replace anything of the form <...> with ''
日期格式统一
import re
import time
from datetime import datetime
def beforeHours2Date(hours, date_format='%Y-%m-%d %H:%M:%S'):
    """Return the local time `hours` hours ago as a formatted string.

    Args:
        hours: number of hours to subtract; accepts int, float, or a
            numeric string. (BUG FIX: the original used int(hours), which
            truncated fractional hours such as 0.5 down to 0.)
        date_format: strftime format for the result. (BUG FIX: the
            original accepted this parameter but ignored it, always
            formatting with '%Y-%m-%d %H:%M:%S'.)

    Returns:
        The formatted local-time string.
    """
    hours = float(hours)
    t = time.time() - hours * 60 * 60
    return time.strftime(date_format, time.localtime(t))
def parse_ymd(s):
    """Normalize a Chinese news date string to 'YYYY-MM-DD HH:MM[:SS]'.

    Absolute forms ('2022-3-15 10:30', '3月15日', '2021-3-15', '3-15') are
    parsed directly; relative forms ('3天前', '2小时前', '昨天', '刚刚', ...)
    are converted to a timestamp via beforeHours2Date().

    NOTE(review): month-day forms without a year assume 2022 — confirm
    whether the year should instead be derived from the current date.
    Returns the input unchanged when no known format matches (the original
    raised an UnboundLocalError in that case).
    """
    full = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", s)
    if full:
        return full[0]  # already a full timestamp; return the matched part
    if '月' in s:  # '3月15日' -> '3-15', then handled by the dash branches
        s = s.replace('月', '-').replace('日', '')
    dashes = s.count('-')
    if dashes == 2:  # 'YYYY-M-D'
        year_s, mon_s, day_s = s.split('-')
        return datetime(int(year_s), int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    if dashes == 1:  # 'M-D', year assumed
        mon_s, day_s = s.split('-')
        return datetime(2022, int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    # Relative expressions -> hours before now.
    if '前天' in s:
        return beforeHours2Date(48)  # BUG FIX: was 3 h; 前天 = two days ago
    if '天前' in s:
        days = re.findall('(.*?)天前', s)[0]
        return beforeHours2Date(float(days) * 24)  # BUG FIX: days were treated as hours
    if '周前' in s:
        return beforeHours2Date(7 * 24)  # BUG FIX: was 7 h; one week = 168 h
    if '小时前' in s:
        return beforeHours2Date(re.findall('(.*?)小时前', s)[0])
    if '昨天' in s:
        return beforeHours2Date(24)
    if '秒' in s or '刚刚' in s:
        return beforeHours2Date(0)
    if '分钟前' in s:
        return beforeHours2Date(0.5)
    return s  # unrecognized format: give it back untouched
# Clean every headline, normalize its date, and print the result.
for idx in range(len(title)):
    # Trim edge whitespace, then drop HTML tags such as <em>...</em>.
    cleaned = re.sub('<.*?>', '', title[idx].strip())
    title[idx] = cleaned
    date[idx] = parse_ymd(date[idx])  # unify the date format
    print(str(idx + 1) + '.' + cleaned + '(' + source[idx] + ' ' + date[idx] + ')')
    print(href[idx])
效果如下:
文本内容深度过滤——剔除噪声数据
1.根据新闻标题进行简单过滤
# 1. Simple filter on the headline: blank out every item whose title
#    does not mention the company.
for i in range(len(title)):
    if company not in title[i]:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
# Items are blanked first and removed afterwards: deleting while the
# for-loop runs would shift indices and raise `list index out of range`.
# BUG FIX: the original tested `'' in title` in all four loops, so the
# empty strings were only ever removed from `title`; each list must be
# scanned itself so the four lists stay aligned.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')
2.根据正文内容进行深度过滤
# 2. Deep filter using the article body: fetch each linked page and keep
#    the item if either the headline or the body mentions the company.
for i in range(len(title)):
    title[i] = title[i].strip()  # trim edge whitespace/newlines
    title[i] = re.sub('<.*?>', '', title[i])  # drop HTML tags such as <em>
    date[i] = parse_ymd(date[i])  # unify the date format
    try:
        article = requests.get(href[i], headers=headers, timeout=10).text  # fetch article body
    except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        article = '爬取失败'
    if company not in title[i] and company not in article:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
    print(str(i + 1) + '.' + title[i] + '(' + source[i] + ' ' + date[i] + ')')
    print(href[i])
# Blank first, remove afterwards, to avoid index errors inside the loop.
# BUG FIX: each while-loop must test its own list; the original tested
# `'' in title` four times, leaving empty strings in href/date/source.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')
注意:
若新闻中可能以公司简称(简写)出现、而我们也需要保留这类数据,那么可以把原先的 if company not in article 换成如下内容:
company_re = compay[0] + '.{0,5}' + company[-1]
if len(re.findall(company_re, article)) <1:
这里补充一个正则表达式,company[0]是公司名称第一个字,company[-1]表示公司名称最后一个字,.{0,5}中.表示任意一个字符,0,5表示0~5个字符(可进行修改),最后通过re.findall()寻找满足匹配规则的内容:若有匹配,列表长度大于等于1;若没有匹配,列表长度为0(小于1),此时就可执行将该新闻赋值为空值的操作。
总代码
import requests
import re
import time
from datetime import datetime
#--------------- Data scraping ----------------
company = '阿里巴巴'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd='+company # rtt=4 sorts by time; the default rtt=1 sorts by relevance
res = requests.get(url, headers=headers).text # headers make the request look like a normal browser
# print(res)
# NOTE(review): the class-name regexes below are tied to Baidu's markup at
# the time of writing — verify they still match the current page source.
p_href = '<h3 class="news-title_1YtI1 "><a href="(.*?)"'  # link of each result
href = re.findall(p_href, res, re.S)
# print(href)
p_title = '<h3 class="news-title_1YtI1 ">.*?aria-label="标题:(.*?)"'  # headline text
title = re.findall(p_title, res, re.S)
# print(title)
p_date = '<span class="c-color-gray2 c-font-normal c-gap-right-xsmall".*?>(.*?)</span>'  # publish date string
date = re.findall(p_date, res)
# print(date)
p_source = '<span class="c-color-gray".*?>(.*?)</span>'  # news source name
source = re.findall(p_source, res)
# print(source)
#---------------- Time helpers ----------------
def beforeHours2Date(hours, date_format='%Y-%m-%d %H:%M:%S'):
    """Return the local time `hours` hours ago as a formatted string.

    Args:
        hours: number of hours to subtract; accepts int, float, or a
            numeric string. (BUG FIX: the original used int(hours), which
            truncated fractional hours such as 0.5 down to 0.)
        date_format: strftime format for the result. (BUG FIX: the
            original accepted this parameter but ignored it, always
            formatting with '%Y-%m-%d %H:%M:%S'.)

    Returns:
        The formatted local-time string.
    """
    hours = float(hours)
    t = time.time() - hours * 60 * 60
    return time.strftime(date_format, time.localtime(t))
def parse_ymd(s):
    """Normalize a Chinese news date string to 'YYYY-MM-DD HH:MM[:SS]'.

    Absolute forms ('2022-3-15 10:30', '3月15日', '2021-3-15', '3-15') are
    parsed directly; relative forms ('3天前', '2小时前', '昨天', '刚刚', ...)
    are converted to a timestamp via beforeHours2Date().

    NOTE(review): month-day forms without a year assume 2022 — confirm
    whether the year should instead be derived from the current date.
    Returns the input unchanged when no known format matches (the original
    raised an UnboundLocalError in that case).
    """
    full = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", s)
    if full:
        return full[0]  # already a full timestamp; return the matched part
    if '月' in s:  # '3月15日' -> '3-15', then handled by the dash branches
        s = s.replace('月', '-').replace('日', '')
    dashes = s.count('-')
    if dashes == 2:  # 'YYYY-M-D'
        year_s, mon_s, day_s = s.split('-')
        return datetime(int(year_s), int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    if dashes == 1:  # 'M-D', year assumed
        mon_s, day_s = s.split('-')
        return datetime(2022, int(mon_s), int(day_s)).strftime('%Y-%m-%d %H:%M:%S')
    # Relative expressions -> hours before now.
    if '前天' in s:
        return beforeHours2Date(48)  # BUG FIX: was 3 h; 前天 = two days ago
    if '天前' in s:
        days = re.findall('(.*?)天前', s)[0]
        return beforeHours2Date(float(days) * 24)  # BUG FIX: days were treated as hours
    if '周前' in s:
        return beforeHours2Date(7 * 24)  # BUG FIX: was 7 h; one week = 168 h
    if '小时前' in s:
        return beforeHours2Date(re.findall('(.*?)小时前', s)[0])
    if '昨天' in s:
        return beforeHours2Date(24)
    if '秒' in s or '刚刚' in s:
        return beforeHours2Date(0)
    if '分钟前' in s:
        return beforeHours2Date(0.5)
    return s  # unrecognized format: give it back untouched
#--------------- Main: clean, normalize, deep-filter, print ----------------
for i in range(len(title)):
    title[i] = title[i].strip()  # trim edge whitespace/newlines
    title[i] = re.sub('<.*?>', '', title[i])  # drop HTML tags such as <em>
    date[i] = parse_ymd(date[i])  # unify the date format
    try:
        article = requests.get(href[i], headers=headers, timeout=10).text  # fetch article body
    except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
        article = '爬取失败'
    # Keep the item only if the headline or the body mentions the company.
    if company not in title[i] and company not in article:
        title[i] = ''
        href[i] = ''
        date[i] = ''
        source[i] = ''
    print(str(i + 1) + '.' + title[i] + '(' + source[i] + ' ' + date[i] + ')')
    print(href[i])
#--------------- Drop the blanked-out items ----------------
# Elements are blanked inside the loop above and removed here, because
# removing during iteration would shift indices (`list index out of range`).
# BUG FIX: the original tested `'' in title` in all four loops, so empty
# strings were only ever removed from `title`; each list must be scanned
# itself so the four lists stay aligned.
while '' in title:
    title.remove('')
while '' in href:
    href.remove('')
while '' in date:
    date.remove('')
while '' in source:
    source.remove('')