读取 Excel 表格数据,清洗正文列后写入新的 Excel 文件
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import sys
import re
import json
import xlwt
import xlrd
from xlutils.copy import copy
reload(sys)
sys.setdefaultencoding('utf-8')
#清洗脏数据
# Junk patterns removed, in this order, from the concatenated paragraph text.
_JUNK_PATTERNS = (
    u'<img(.*?)"">',
    u'<strong>(.*?)</strong>',
    u'↑(.*?)关注我们',
    u'<b(.*?)r>',
)


def _extract_picture_and_text(html):
    """Split one raw content cell into its first image link and cleaned text.

    Args:
        html: The raw cell text (an HTML fragment).

    Returns:
        A ``(picture, text)`` tuple. ``picture`` is the first ``src`` value
        found, or ``''`` when there is none. ``text`` is the concatenation of
        every ``<p>...</p>`` body with the junk patterns removed.
    """
    # NOTE(review): the doubled quotes look like CSV-style escaping left in
    # the exported data -- confirm against the real workbook.
    pictures = re.findall(r'src=""(.+?)""', html)
    picture = pictures[0] if pictures else ''

    text = ''.join(re.findall(r'<p>(.+?)</p>', html))
    for pattern in _JUNK_PATTERNS:
        text = re.sub(pattern, '', text)
    return picture, text


def testMunicipalCommitteeContent():
    """Clean the content column of the source workbook and save a copy.

    Reads the first sheet of the hard-coded source workbook, copies its two
    header rows, and for every following row replaces the cell at column
    ``ncols - 10`` with its cleaned text. The first image link found in that
    cell is written to one extra column appended at index ``ncols``. The
    result is saved to the hard-coded output path.
    """
    data = xlrd.open_workbook(u'D:\\PythonFiles\\clearData\\今日头条.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols  # number of columns
    content_col = ncols - 10

    wb = xlwt.Workbook()
    ws = wb.add_sheet('News', cell_overwrite_ok=True)

    # Copy the two header rows, fetching each row once instead of per cell.
    if ncols:
        for header_row in (0, 1):
            header = table.row_values(header_row)
            for x in range(ncols):
                ws.write(header_row, x, header[x])
    ws.write(0, ncols, u'图片链接')
    ws.write(1, ncols, 'picture_id')

    for i in range(2, nrows):
        row = table.row_values(i)
        raw = row[content_col]
        if raw:
            picture, result = _extract_picture_and_text(raw)
        else:
            # Empty content cell: nothing to clean and no picture.
            picture, result = '', ''

        # Columns before and after the content column are copied unchanged.
        for m in range(content_col):
            ws.write(i, m, row[m])
        ws.write(i, content_col, result)
        for m in range(content_col + 1, ncols):
            ws.write(i, m, row[m])
        ws.write(i, ncols, picture)

    wb.save(r'D:\PythonFiles\clearData\todayNews.xls')
# Script entry point: run the workbook clean-up only when executed directly.
if __name__ == "__main__":
    testMunicipalCommitteeContent()
清洗数据:把正则表达式写在文本文件里,程序读取后逐条用来清洗
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2016 R&D, CINS Inc. (cins.com)
#
# Author: Eric x.sun <followyourheart1211@gmail.com>
#
import os
import re
import sys
from optparse import OptionParser
import common_filter_regex
import settings
reload(sys)
sys.setdefaultencoding("utf-8")
# A URL inside the content: "http" followed by a run of characters outside
# U+4E00..U+9FA5, so the match ends at the next character in that range.
REGEX_URL = "http" + u"[^\u4e00-\u9fa5]+"
# A literal ".jpg" or ".JPG". The dot is escaped so it no longer matches an
# arbitrary character in front of "jpg".
REGEX_JPG = r"\.(?:jpg|JPG)"
def remove_useless(content, fileurl, encoding="utf-8"):
    """Remove the useless parts of content.

    Every non-blank line of the pattern file is applied as a regular
    expression and its matches are deleted. After that, matches of REGEX_URL
    and REGEX_JPG are deleted and leading characters listed in
    settings.PUNCTUATIONS are stripped.

    Args:
        content: The content to clean, as a byte string or as text.
        fileurl: Path of the pattern file, one regular expression per line.
        encoding: The encoding of the pattern file and of byte-string content.

    Returns:
        The cleaned content with surrounding whitespace stripped.
    """
    # Binary mode plus an explicit decode gives text patterns regardless of
    # the interpreter's default encoding, and "with" closes the handle.
    with open(fileurl, 'rb') as pattern_file:
        patterns = [raw.strip().decode(encoding) for raw in pattern_file]

    content = content.strip()
    # Decode once at the boundary instead of re-decoding after every step.
    if isinstance(content, bytes):
        content = content.decode(encoding)

    if content:
        for pattern in patterns:
            if pattern:  # a blank line would be an empty, no-op regex
                content = re.sub(pattern, '', content)
        content = re.sub(REGEX_URL, "", content)
        content = re.sub(REGEX_JPG, "", content)
        # Punctuation at the start of the sentence.
        content = content.lstrip(settings.PUNCTUATIONS)

    # Whitespace at both ends of the sentence.
    return content.strip()
# The literal "[图片]" placeholder. The brackets are escaped so the pattern is
# not read as a character class that would blank every single 图 or 片.
_IMAGE_PLACEHOLDER_RE = re.compile(u"\\[\u56fe\u7247\\]")
# Two or more consecutive whitespace characters.
_MULTI_SPACE_RE = re.compile(u"\\s{2,}")


def replace_with_space(content):
    """Replace the useless parts of content with one space.

    Applies the common_filter_regex HTML helpers (tags, character entities,
    URLs), then replaces each "[图片]" placeholder and each run of whitespace
    with a single space.

    Args:
        content: The content, in string format, to be replaced.

    Returns:
        The content after replacing, with surrounding whitespace stripped.
    """
    content = content.strip()
    if content:
        # \n, \r and \t inside the content are expected to have been handled
        # earlier; the original comment points at common_replacer.py.
        content = common_filter_regex.replace_html_tags(content)
        content = common_filter_regex.replace_html_char_entity(content)
        content = common_filter_regex.replace_html_url(content, " ")
        content = _IMAGE_PLACEHOLDER_RE.sub(" ", content)
        content = _MULTI_SPACE_RE.sub(" ", content)
    return content.strip()
def read_input(fd, delimiter):
    """Yield each line of fd split into columns.

    Surrounding whitespace is stripped from every line before splitting,
    except for characters that belong to the delimiter. With a whitespace
    delimiter such as a tab, leading and trailing empty columns are therefore
    kept and column positions do not shift.

    Args:
        fd: An iterable of lines, for example an open file.
        delimiter: The delimiter between columns.

    Yields:
        The list of column values of one line.
    """
    if delimiter:
        strip_chars = "".join(
            ch for ch in " \t\n\r\x0b\x0c" if ch not in delimiter
        )
    else:
        # No delimiter given: keep the plain strip() behaviour.
        strip_chars = None
    for line in fd:
        yield line.strip(strip_chars).split(delimiter)
def check_parameters(**kwargs):
    """Validate the keyword options, printing a hint for the first problem.

    Args:
        delimiter: The delimiter between columns; must not be None.
        index: The column indexes of the content; must not be None.
        data: Optional file name of the data; when given it must be an
            existing file.

    Returns:
        True when every check passes, otherwise False.
    """
    if kwargs.get("delimiter") is None:
        hint = (
            "The delimiter is required.",
            "Use '-s' in console mode or 'delimiter=' in func call to set it.",
        )
        print("{0}".format("\n".join(hint)))
        return False

    # A missing 'index' key falls back to 0 and passes; only an explicit
    # None is rejected.
    if kwargs.get("index", 0) is None:
        hint = (
            "The indexes is required.",
            "Use '-i' in console mode or 'index=' in func call to set it.",
        )
        print("{0}".format("\n".join(hint)))
        return False

    path = kwargs.get("data")
    if path is None or os.path.isfile(path):
        return True
    print("The data does not exist: {0}.".format(path))
    return False
def _clean_field(raw, clean):
    """Return the cleaned form of one column value.

    When the value contains ``<p>...</p>`` pairs their bodies are
    concatenated; otherwise the whole value is used. The text then goes
    through remove_useless() and replace_with_space().
    """
    paragraphs = re.findall(r'<p>(.+?)</p>', raw)
    text = ''.join(paragraphs) if paragraphs else raw
    cleaned = replace_with_space(remove_useless(text, clean, "utf-8"))
    # Keep the row homogeneous: a byte-string column stays a byte string, so
    # joining and writing the row needs no implicit default-encoding step.
    if isinstance(raw, bytes) and not isinstance(cleaned, bytes):
        cleaned = cleaned.encode("utf-8")
    return cleaned


def _clean_rows(src, dst, delimiter, indexes, clean):
    """Clean the selected columns of every row of src and write rows to dst."""
    for line_no, obj in enumerate(read_input(src, delimiter), 1):
        empty_line = False
        for i in indexes:
            try:
                obj[i] = _clean_field(obj[i], clean)
            except Exception as exc:
                # Best effort: a bad row (too short, undecodable, ...) keeps
                # its original column, but the problem is reported rather
                # than silently swallowed.
                sys.stderr.write(
                    "line {0}, column {1}: not cleaned ({2!r})\n".format(
                        line_no, i + 1, exc
                    )
                )
                continue
            # As in the original flow, the last successfully cleaned column
            # decides whether the row is dropped.
            empty_line = not obj[i]
        if not empty_line:
            dst.write(delimiter.join(obj).strip() + "\n")


def main(delimiter, indexes, data, out, clean):
    """Clean the selected columns of a delimited file.

    Args:
        delimiter: The delimiter between columns, or a key of
            settings.FIELD_DELIMITER that maps to it.
        indexes: The 1-based column numbers to clean, joined with "|"
            (for example "1|3|4"). Tokens that are not digits are ignored.
        data: The input file name, or None to read from standard input.
        out: The output file name, or None to write to standard output.
        clean: The pattern file name handed to remove_useless().

    Returns:
        True once all rows have been processed.
    """
    # A real list, so the columns can be walked again for every row.
    columns = [
        int(token) - 1 for token in indexes.split("|") if token.isdigit()
    ]
    delimiter = settings.FIELD_DELIMITER.get(delimiter, delimiter)

    src = sys.stdin if data is None else open(data, "rb")
    try:
        dst = sys.stdout if out is None else open(out, "wb")
        try:
            _clean_rows(src, dst, delimiter, columns, clean)
        finally:
            # Only close what was opened here, even when cleaning failed.
            if out is not None:
                dst.close()
    finally:
        if data is not None:
            src.close()
    return True
# Script entry point: clean column 19 of a \001-delimited file using the
# hard-coded paths below.
if __name__ == "__main__":
    source_path = r"D:\PythonFiles\clearData\2017041820.news_zhengwen"
    # source_path = r"D:\PythonFiles\clearData\test.news_zhengwen"
    target_path = r"D:\PythonFiles\clearData\tetete.news_zhengwen"
    pattern_path = 'clearn.txt'
    main(
        delimiter='\001',
        indexes='19',
        data=source_path,
        out=target_path,
        clean=pattern_path,
    )
# parser = OptionParser(usage="%prog -s delimiter -i index_array -d data -o out -c clean")
#
# parser.add_option(
# "-s", "--delimiter",
# help=u"The delimiter between columns, like \001"
# )
#
# parser.add_option(
# "-i", '--index_array',
# help=u"Array of index in content, that need to been cleaned, starts at 1, like \"1|3|4\"."
# )
#
# parser.add_option(
# "-d", "--data",
# help=u"The file name of the data to be tagged(includes the full path)"
# )
#
# parser.add_option(
# "-o", "--out",
# help=u"The file name of the cleaned data(includes the full path)"
# )
#
# parser.add_option(
# "-c", "--clean",
# help=u"The file name of the file of cleaning data(includes the full path)"
# )
#
# if not sys.argv[1:]:
# parser.print_help()
# exit(1)
#
# (opts, args) = parser.parse_args()
# main(delimiter=opts.delimiter, indexes=opts.index_array, data=opts.data, out=opts.out, clean=opts.clean)
清洗用的正则表达式文件 clearn.txt(每行一条正则表达式):
<img(.*?)">
<strong>(.*?)</strong>
↑(.*?)关注我们
<b(.*?)r>
参考资料:https://blog.csdn.net/m0_37717595/article/details/80603884