python 清洗脏数据

最新推荐文章于 2022-03-28 10:39:40 发布

逍遥_yjz

最新推荐文章于 2022-03-28 10:39:40 发布

阅读量930

点赞数

分类专栏： python基础

本文链接：https://blog.csdn.net/xiaoyaozizai017/article/details/75040411

版权

python基础专栏收录该内容

24 篇文章 0 订阅

订阅专栏

读取 Excel 数据，清洗后写入新的 Excel 文件

# -*- coding: utf-8 -*-
#!/usr/bin/env python
import sys
import re
import json
import xlwt
import xlrd
from xlutils.copy import copy

# Python 2-only idiom: reloading ``sys`` makes ``setdefaultencoding`` callable
# again, so the interpreter-wide default codec used for implicit str/unicode
# conversions can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Clean dirty data: strip markup from the article-body column of a workbook.
def _extract_image_and_text(raw_html):
    """Split one raw article cell into its first image link and cleaned text.

    Args:
        raw_html: Cell text holding HTML whose attribute quotes are doubled
            (``src=""...""``).

    Returns:
        A ``(image_link, text)`` tuple. ``image_link`` is the first ``src``
        value found, or ``''`` when there is none. ``text`` is the
        concatenation of every ``<p>...</p>`` body after the noise patterns
        below have been removed.
    """
    image_links = re.findall(r'src=""(.+?)""', raw_html)
    image_link = image_links[0] if image_links else ''

    # Only paragraph bodies are kept; everything outside <p>...</p> is dropped.
    text = ''.join(re.findall(r'<p>(.+?)</p>', raw_html))

    noise_patterns = (
        u'<img(.*?)"">',            # image tags (doubled closing quote)
        u'<strong>(.*?)</strong>',  # bold spans, including their text
        u'↑(.*?)关注我们',           # "follow us" banner introduced by an up arrow
        u'<b(.*?)r>',               # anything from "<b" up to the next "r>"
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return image_link, text


def testMunicipalCommitteeContent(
        src_path=u'D:\\PythonFiles\\clearData\\今日头条.xlsx',
        dst_path=r'D:\PythonFiles\clearData\todayNews.xls'):
    """Copy the first sheet of a workbook, cleaning its article-body column.

    Rows 0 and 1 are treated as header rows and copied unchanged. For every
    later row the cell located ten columns before the end is replaced by its
    cleaned text, and the first image link found in that cell is written to a
    new trailing column.

    Args:
        src_path: Workbook to read. Defaults to the originally hard-coded file.
        dst_path: Workbook to write. Defaults to the originally hard-coded file.

    Raises:
        ValueError: If there are data rows but fewer than ten columns, since
            the article-body column cannot be located.
    """
    table = xlrd.open_workbook(src_path).sheets()[0]
    nrows = table.nrows
    ncols = table.ncols  # number of columns in the source sheet
    content_col = ncols - 10  # the article body sits ten columns from the end

    if nrows > 2 and content_col < 0:
        raise ValueError(
            'expected at least 10 columns, found {0}'.format(ncols))

    wb = xlwt.Workbook()
    ws = wb.add_sheet('News', cell_overwrite_ok=True)

    # Copy whichever of the two header rows exist, then label the new column.
    for header_idx in range(min(2, nrows)):
        for col, value in enumerate(table.row_values(header_idx)):
            ws.write(header_idx, col, value)
    ws.write(0, ncols, u'图片链接')
    ws.write(1, ncols, 'picture_id')

    for i in range(2, nrows):
        # Read the row once instead of rebuilding it for every cell.
        row = table.row_values(i)
        raw_content = row[content_col]
        if raw_content:
            image_link, cleaned = _extract_image_and_text(raw_content)
        else:
            image_link, cleaned = '', ''

        for col, value in enumerate(row):
            ws.write(i, col, cleaned if col == content_col else value)
        ws.write(i, ncols, image_link)

    wb.save(dst_path)

# Run the workbook clean-up only when this file is executed as a script.
if __name__ == '__main__':
    testMunicipalCommitteeContent()

清洗数据：把正则表达式写入文本文件，程序读取后据此清洗

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2016 R&D, CINS Inc. (cins.com)
#
# Author: Eric x.sun <followyourheart1211@gmail.com>
#

import os
import re
import sys
from optparse import OptionParser

import common_filter_regex
import settings

# Python 2-only idiom: reloading ``sys`` makes ``setdefaultencoding`` callable
# again, so the interpreter-wide default codec used for implicit str/unicode
# conversions can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# URLs in the content: "http" followed by every character up to the next
# character in the range U+4E00-U+9FA5.
REGEX_URL = "http" + u"[^\u4e00-\u9fa5]+"

# A literal ".jpg" in any letter case (.jpg or .JPG). The dot is escaped so
# that an arbitrary character followed by "jpg" is no longer removed.
REGEX_JPG = r"(?i)\.jpg"


def remove_useless(content, fileurl, encoding="utf-8"):
    """Remove unwanted fragments from ``content``.

    Every regex listed in ``fileurl`` is deleted from the content, followed by
    ``REGEX_URL`` and ``REGEX_JPG`` matches and any leading characters found
    in ``settings.PUNCTUATIONS``.

    Args:
        content: The content, in string format, to be cleaned.
        fileurl: Path of a text file holding one regular expression per line.
            Blank lines are ignored.
        encoding: The encoding of ``content`` (when it is a byte string) and
            of the pattern file.

    Returns:
        The cleaned content with surrounding whitespace stripped. Content that
        is empty after stripping is returned as-is, without touching
        ``fileurl``.
    """
    content = content.strip()
    if not content:
        return content

    # Decode once at the boundary rather than before every substitution.
    if isinstance(content, bytes):
        content = content.decode(encoding)

    # The context manager closes the pattern file even if decoding fails.
    with open(fileurl, 'rb') as pattern_file:
        patterns = [raw.strip().decode(encoding) for raw in pattern_file]

    for pattern in patterns:
        if pattern:
            content = re.sub(pattern, '', content)

    content = re.sub(REGEX_URL, "", content)
    content = re.sub(REGEX_JPG, "", content)

    # Punctuation at the start of the sentence.
    content = content.lstrip(settings.PUNCTUATIONS)

    # Whitespace at the start and end of the sentence.
    return content.strip()


# The literal placeholder "[" U+56FE U+7247 "]" (the two characters mean
# "picture"). The brackets are escaped so that they are not read as a
# character class matching either character on its own.
_IMAGE_PLACEHOLDER_RE = re.compile(u"\\[\u56fe\u7247\\]")

# Runs of two or more whitespace characters.
_WHITESPACE_RUN_RE = re.compile(u"\\s{2,}")


def replace_with_space(content):
    """Replace the useless parts of ``content`` with a single space.

    Args:
        content: The content, in string format, to be replaced.

    Returns:
        The content after replacing, with surrounding whitespace stripped.
    """
    content = content.strip()
    if not content:
        return content

    # \n, \r and \t inside the content are handled beforehand
    # (see common_replacer.py).

    # HTML tags, character entities and URLs.
    content = common_filter_regex.replace_html_tags(content)
    content = common_filter_regex.replace_html_char_entity(content)
    content = common_filter_regex.replace_html_url(content, " ")

    # Image placeholder.
    content = _IMAGE_PLACEHOLDER_RE.sub(" ", content)

    # Consecutive whitespace.
    content = _WHITESPACE_RUN_RE.sub(" ", content)

    return content.strip()


def read_input(fd, delimiter):
    """Yield every line of ``fd`` as a list of fields.

    Each line is stripped of leading and trailing whitespace before it is
    split, so with a whitespace delimiter leading or trailing empty fields
    are lost.

    Args:
        fd: An iterable of lines, such as an open file.
        delimiter: The separator between fields.

    Yields:
        The list of fields of one line.
    """
    for raw_line in fd:
        trimmed = raw_line.strip()
        yield trimmed.split(delimiter)


def check_parameters(**kwargs):
    """Check whether the parameters satisfy the conditions.

    Args:
        delimiter: The delimiter between columns. Required.
        index: The indexes of the content columns. Required; a missing key is
            now rejected just like an explicit ``None``.
        data: The file name of the data. Optional; when given it must be an
            existing file.

    Returns:
        A boolean value for representing the status of the checking. The
        reason for a failure is printed.
    """
    required = (
        ("delimiter",
         "The delimiter is required.",
         "Use '-s' in console mode or 'delimiter=' in func call to set it."),
        ("index",
         "The indexes is required.",
         "Use '-i' in console mode or 'index=' in func call to set it."),
    )
    for key, summary, hint in required:
        if kwargs.get(key) is None:
            print("{0}".format("\n".join([summary, hint])))
            return False

    data = kwargs.get('data', None)
    if data is not None and not os.path.isfile(data):
        print("The data does not exist: {0}.".format(data))
        return False

    return True


def main(delimiter, indexes, data, out, clean):
    """Clean selected columns of a delimited text stream.

    For every selected column the ``<p>...</p>`` bodies are concatenated (the
    whole field is used when it contains no ``<p>`` element) and passed
    through ``remove_useless`` and ``replace_with_space``.

    Args:
        delimiter: The column separator, or a key of
            ``settings.FIELD_DELIMITER`` naming one.
        indexes: 1-based column numbers joined by "|", e.g. "1|3|4".
            Non-numeric tokens are ignored.
        data: Path of the input file, or None to read from standard input.
        out: Path of the output file, or None to write to standard output.
        clean: Path of the pattern file handed to ``remove_useless``.

    Returns:
        True once the whole input has been processed.
    """
    # A list, not a lazy map/filter, because it is iterated once per record.
    columns = [int(token) - 1 for token in indexes.split("|") if token.isdigit()]

    # A symbolic delimiter name is translated; anything else is used as-is.
    if delimiter in settings.FIELD_DELIMITER:
        delimiter = settings.FIELD_DELIMITER[delimiter]

    source = sys.stdin if data is None else open(data, "rb")
    try:
        sink = sys.stdout if out is None else open(out, "wb")
        try:
            for obj in read_input(source, delimiter):
                empty_line = False
                for i in columns:
                    try:
                        field = obj[i]
                    except IndexError:
                        # The record is shorter than expected: leave it as it is.
                        continue

                    try:
                        paragraphs = re.findall(r'<p>(.+?)</p>', field)
                        # Fall back to the whole field when there is no <p>.
                        text = ''.join(paragraphs) if paragraphs else field
                        obj[i] = replace_with_space(
                            remove_useless(text, clean, "utf-8"))
                    except Exception as exc:
                        # Stay best-effort, but report on stderr so that the
                        # failure is visible and stdout remains pure data.
                        sys.stderr.write(
                            "main: column {0} left uncleaned: {1!r}\n".format(
                                i + 1, exc))
                        continue

                    # The most recently cleaned column decides whether the
                    # record is dropped.
                    empty_line = not obj[i]

                if not empty_line:
                    sink.write("{0}\n".format(delimiter.join(obj).strip()))
        finally:
            if out is not None:
                sink.close()
    finally:
        if data is not None:
            source.close()

    return True


if __name__ == "__main__":
    # Hard-coded sample run (the option-parser CLI below is commented out).
    # Input file to clean.
    data_firl = r"D:\PythonFiles\clearData\2017041820.news_zhengwen"
    #data_firl = r"D:\PythonFiles\clearData\test.news_zhengwen"
    # Destination of the cleaned records.
    out_firl=r"D:\PythonFiles\clearData\tetete.news_zhengwen"
    # Pattern file: one removal regex per line, resolved against the working directory.
    clean_firl = 'clearn.txt'
    # Fields are separated by \001 and column 19 (1-based) is the one cleaned.
    main('\001', '19', data_firl, out_firl,clean_firl)
    # parser = OptionParser(usage="%prog -s delimiter -i index_array -d data  -o out -c clean")
    #
    # parser.add_option(
    #     "-s", "--delimiter",
    #     help=u"The delimiter between columns, like \001"
    # )
    #
    # parser.add_option(
    #     "-i", '--index_array',
    #     help=u"Array of index in content, that need to been cleaned, starts at 1, like \"1|3|4\"."
    # )
    #
    # parser.add_option(
    #     "-d", "--data",
    #     help=u"The file name of the data to be tagged(includes the full path)"
    # )
    #
    # parser.add_option(
    #     "-o", "--out",
    #     help=u"The file name of the cleaned data(includes the full path)"
    # )
    #
    # parser.add_option(
    #     "-c", "--clean",
    #     help=u"The file name of the file of cleaning data(includes the full path)"
    # )
    #
    # if not sys.argv[1:]:
    #     parser.print_help()
    #     exit(1)
    #
    # (opts, args) = parser.parse_args()
    # main(delimiter=opts.delimiter, indexes=opts.index_array, data=opts.data, out=opts.out, clean=opts.clean)

清洗规则文件 clearn.txt 的内容：

<img(.*?)">
<strong>(.*?)</strong>
↑(.*?)关注我们
<b(.*?)r>

参考资料：https://blog.csdn.net/m0_37717595/article/details/80603884

逍遥_yjz

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录