将爬取到的字符串清除各种乱码后写入 Excel（爬虫后在 Excel 中删除乱码）- CSDN博客

本文链接：https://blog.csdn.net/newnotes/article/details/106001337

第三版修改：已基本实现所需功能——将爬取到的字符串清除各种乱码后，写入 Excel。
1、格式化字符串内容
2、将格式化后的内容写入 Excel


from openpyxl import Workbook
from pandas import DataFrame
import pandas as pd
# Sample scraped record: "label：value" pairs (full-width colon) separated by
# single spaces. Bound to `raw_text` rather than `str` so the builtin `str`
# stays usable.
raw_text = '生产经营场所地址：江西省宜春市袁州区渥江镇湾田村 行业类别：猪的饲养 所在地区：江西省-宜春市-袁州区 发证机关：宜春市生态环境局'
print(raw_text)

# Splitting on the colon yields the first label, three "value next_label"
# pieces, and finally the last value.
u1, u2, u3, u4, u5 = raw_text.split('：', 4)
print(u1)  # 生产经营场所地址


def _value_and_next_label(piece):
    """Split a 'value next_label' piece at its last space.

    Returns two one-element lists, ``[value]`` and ``[next_label]``.
    Splitting from the right keeps the value intact even if it contains
    spaces itself. Raises ValueError when the piece contains no space.
    """
    value, next_label = piece.rsplit(' ', 1)
    return [value], [next_label]


f1, f2 = _value_and_next_label(u2)  # ['江西省宜春市袁州区渥江镇湾田村'], ['行业类别']
print(f1)
print(f2)
print('.................................................')
g1, g2 = _value_and_next_label(u3)  # ['猪的饲养'], ['所在地区']
print(g1)
print(g2)
print('.................................................')
h1, h2 = _value_and_next_label(u4)  # ['江西省-宜春市-袁州区'], ['发证机关']
print(h1)
print(h2)

wb = Workbook()  # new in-memory workbook
ws = wb.active   # its default (first) worksheet

# Row 1 holds the labels, row 2 the matching values, one pair per column.
headers = [u1, f2[0], g2[0], h2[0]]
values = [f1[0], g1[0], h1[0], u5]
for column, (header, value) in enumerate(zip(headers, values), start=1):
    ws.cell(row=1, column=column, value=header)
    ws.cell(row=2, column=column, value=value)


wb.save("d:\\sample.xlsx")

# Exploratory parsing of the scraped record with str.find and slicing.
# The record is bound to `record` rather than `str` so the builtin `str`
# stays usable.
record = '生产经营场所地址：江西省宜春市袁州区渥江镇湾田村 行业类别：猪的饲养 所在地区：江西省-宜春市-袁州区 发证机关：宜春市生态环境局'
print(type(record))

# Space-separated "label：value" tokens.
tokens = record.split(' ')
print(tokens)

# The first label runs up to the first full-width colon.
colon_pos = record.find('：')
first_label = record[:colon_pos]

# The first value runs from just after that colon up to the first space.
# (str.find takes an integer start index as its second argument, so the
# separator must be passed as the first argument.)
space_pos = record.find(' ')
first_value = record[colon_pos + 1:space_pos]

print(first_label)
print(space_pos)

# encoding:utf-8
import pandas as pd
import openpyxl



# Load the scraped data and take the first worksheet of the workbook.
wk = openpyxl.load_workbook(r"t.xlsx")
wk_name = wk.sheetnames
wk_sheet = wk[wk_name[0]]
print(wk_sheet)

# Cell A1 holds the raw scraped text; an empty cell yields None.
raw_text = wk_sheet.cell(row=1, column=1).value or ""
# Drop the CRLF line breaks left over from the HTML, then collapse every run
# of whitespace (spaces, tabs, ...) into a single space and trim both ends.
content_A1 = " ".join(raw_text.replace("\r\n", "").split())
print(type(content_A1))

# Each space-separated token is expected to look like "label：value"
# (full-width colon); a token without a colon gets an empty value.
pairs = []
for token in content_A1.split():
    label, _, value = token.partition('：')
    pairs.append((label, value))
print(pairs)

# Lay the data out as a header row followed by a value row.
# NOTE(review): the original loop referenced an undefined `last_Data`; this
# two-row layout is an assumption -- confirm it is the intended one.
last_Data = [
    [label for label, _ in pairs],
    [value for _, value in pairs],
]

# Write the rows into a new workbook: row index first, then column index.
workbook = openpyxl.Workbook()
ws = workbook.active
for flg, lines in enumerate(last_Data, start=1):
    for i, cell_value in enumerate(lines, start=1):
        ws.cell(flg, i, cell_value)
# Save once, after all rows are written.
workbook.save("2.xlsx")

# Re-save the source workbook under a new name; its cells are not modified above.
wk.save(r"tt.xlsx")

#wk_sheet.unmerge_cells('a1:b1')  # 拆分 b1:E2区域单元格