用python将word文档导入数据库_python读取word文档,插入mysql数据库实例

表格内容如下:

1、实现批量导入word文档,取文档标题中的数字作为编号

2、除取上面打钩的内容需要匹配出来入库入库,其他内容全部直接入库mysql

# wuyanfeng

# -*- coding:utf-8 -*-

# 读取docx中的文本代码示例

import docx

import pymysql

import re

import os

# 创建数据库链接

conn = pymysql.connect(

host='rm-bp1vu5d84dg12c6d59o.mysql.rds.aliyuncs.com',

port=3306,

user='root',

passwd='wYf092415*',

db='pays',

charset='utf8',

)

# 创建游标

cursor = conn.cursor()

#切片函数

def section(info,key,len11):

a = len(info)

print(a, type(a))

d = []

e = 0

g = -1

i = 0

task_class=[]

while i < len(info):

# for i in range(len(info)):

# i+=1

print("i::::", i)

try:

#c = info.index("a", e)

#print("c:::::", c)

c = info.index(key, e)

#print("c:::::", c)

print("c类型判断",type(c))

except ValueError:

print(ValueError)

try:

if (c != '') & (g < int(c)):

d.append(c)

g = c

i = c + 1

print("illlldddd:", i)

e = c + 1

continue

elif (c == ''):

break

except UnboundLocalError:

print(UnboundLocalError)

return task_class

break

print("d", d, type(d))

print(d[0], type(d[0]))

print("d的长度:",len(d))

#开始切片

if len(d) != 0:

for j in range(len(d)):

print("info11:::", info, type(info))

info = ''.join(info)

print("info222:::",info,type(info))

print("d[%d]"%j,d[j])

#print("d[j]:5"%j,info[d[j]:5])

llll = info[d[j]+1:d[j]+5]

print("d[%d]:5" % j, llll)

task_class.append(llll)

print("task_class::11", task_class)

task_class=",".join(task_class)

print("str1112222",task_class)

return task_class

def insettable(file):

print("file::::::::::::::::::::", file)

print("type::::::::::::::::::::", type(file))

# file1 = file

# file1 = str(file1)

ddd = re.findall("知识库\\\(\d+)", file)

print("ddd:::::::::::", ddd)

print("ddd[0]:::", ddd[0])

ddd = int(ddd[0])

print("ddd::::", type(ddd))

file = docx.Document(file)

# 读取表格:

t = file.tables[0]

print(t)

print("1:", t.cell(0, 0).text) # 1

cell1 = t.cell(0, 0).text

print("tyep::::", type(t.cell(0, 0).text))

print("2:", t.cell(0, 1).text) # 2

cell2 = t.cell(0, 1).text

print("2:", t.cell(0, 2).text) # 2

cell3 = t.cell(0, 2).text

print("2:", t.cell(0, 3).text) # 2

cell4 = t.cell(0, 3).text

print("cell4:::::::::", cell4)

print("3:", t.cell(1, 0).text) # 3

cell5 = t.cell(1, 0).text

print("4:", t.cell(1, 2).text) # 4

cell6 = t.cell(1, 2).text

print("5:", t.cell(1, 3).text) # 5

task_type = t.cell(1, 3).text

# task_type = re.findall('.*[☑√](.*)$', cell7)

# task_type = ''.join(cell7)

print("task_type111111:", task_type)

# task_class = task_class[0:4]

'''低级处理方式

a = int(task_type.count("☑"))

print("a|||||||", a, type(a))

b = int(task_type.count("√"))

print("b|||||||", b, type(a))

if (a == 1) | (b == 1):

print("111111111111111111")

# task_type = re.findall('.*[☑√](.*)$', task_type)

task_type = re.findall('.*[☑√](.*)$', task_type)

print("task_type1", task_type)

task_type = ''.join(task_type)

print("task_type2", task_type)

task_type = task_type[0:4]

print("task_type3:d:%s,b=%d" % (a, b), task_type)

elif (a == 0) & (b == 0):

print("2222222222222222222")

task_type = '法定职责'

print("a:%s,b=%s" % (a, b), task_type)

elif (a == 2) | (b == 2):

print("333333333333333333333")

task_type = '法定职责,工作职责 '

print("a:%s,b=%s" % (a, b), task_type)

'''

#调用切片函数

task_type1 = section(task_type, "√", 4)

task_type2 = section(task_type, "☑", 4)

task_type1 = "".join(task_type1)

task_type2 = "".join(task_type2)

print("task_type1:::", task_type1,type(task_type1))

print("task_type2:::", task_type2,type(task_type2))

if task_type1.strip()!="":

task_type = task_type1

print("task_type111:::", task_type1)

elif task_type2.strip()!="":

task_type = task_type2

print("task_type222:::", task_type2)

print("6:", t.cell(1, 4).text) # 6

cell8 = t.cell(1, 4).text

print("7:", t.cell(2, 1).text) # 7

cell9 = t.cell(2, 1).text

# 获取文档对象

# file = docx.Document("D:\\配置库\\公案APP\\1.2 系统规格\\知识库\\14人员死亡先期处置.docx")

print("段落数:" + str(len(file.paragraphs))) # 段落数为13,每个回车隔离一段

lenn = len(file.paragraphs)

print("len:", lenn)

# 输出每一段的内容

for para in file.paragraphs:

print(para.text)

# 输出段落编号及段落内容

for i in range(len(file.paragraphs)):

print("第" + str(i) + "段的内容是:" + file.paragraphs[i].text)

list6 = []

for i in range(len(file.paragraphs)):

if 0 == i:

print("i:", i)

lis0 = file.paragraphs[i].text

print("list0:", lis0)

print(type(lis0))

elif 1 == i:

print("i:", i)

task_class = file.paragraphs[i].text

print("lis1", task_class,type(task_class))

'''低级处理方式

print("task_class111111:", task_class)

c = int(task_class.count("☑"))

task_class = ''.join(task_class)

#print(task_class.index('☑'))

print("c|||||||", c, type(c))

d = int(task_class.count("√"))

print(task_class.index('√'))

print("d|||||||", d, type(d))

task_class = re.findall(r'[☑√](?:.*)', task_class)

task_class = ''.join(task_class)

task_class = task_class[1:5]

print("task_class", task_class)

'''

#调用切片函数

task_class1 = section(task_class, "√", 4)

task_class2 = section(task_class, "☑", 4)

task_class1 = "".join(task_class1)

task_class2 = "".join(task_class2)

print("task_class1:::", task_class1,type(task_class1))

print("task_class2:::", task_class2,type(task_class2))

if task_class1.strip()!="":

task_class = task_class1

print("task_class11:::", task_class1)

elif task_class2.strip()!="":

task_class = task_class2

print("task_class22:::", task_class2)

if 2 == i:

print("i:", i)

lis2 = file.paragraphs[i].text

print("lis2", lis2)

print(type(lis2))

preparer = re.findall('填表单位:(.*?)$', lis2)

preparer = ''.join(preparer)

print("preparer:%s" % preparer)

# elif 3 == i:

# print("i:", i)

# lis3 = file.paragraphs[i].text

elif 3 == i:

print("i:", i)

lis4 = file.paragraphs[i].text

print("lis4", lis4)

print(type(lis4))

elif 3 < i < lenn - 1:

print("i:", i)

print(file.paragraphs[i].text)

print(type(file.paragraphs[i].text))

# list6[i-5] = list6.append(file.paragraphs[i].text)

list6.append(str(file.paragraphs[i].text).strip('\xa0'))

# list6.append("%s\n" % str(file.paragraphs[i].text).strip('\xa0'))

print(list6)

key_steps = "\n".join(list6)

# print("key_steps:\n",key_steps.strip('\n'))

cursor.execute(

"insert into `t_knowledge_base` (`no`, `preparer`, `task_class`, `task_name`, `task_specification`, `task_type`, `task_desc`, `task_basis`, `key_steps`) values ('%d','%s','%s','%s','%s','%s','%s',NULL,'%s')" % (

ddd, preparer, task_class, cell2, cell4, task_type, cell9, key_steps))

conn.commit()

def traverse(f):

fs = os.listdir(f)

for f1 in fs:

tmp_path = os.path.join(f, f1)

if not os.path.isdir(tmp_path):

print('文件: %s' % tmp_path)

insettable(tmp_path)

else:

print('文件夹:%s' % tmp_path)

traverse(tmp_path)

path = 'D:\\配置库\公案APP\\1.2 系统规格\\知识库'

traverse(path)

#单文件调测

# path = 'D:\\配置库\\公案APP\\1.2 系统规格\\知识库\\14人员死亡先期处置.docx'

# insettable(path)

# 关闭游标

cursor.close()

# 关闭连接

conn.close()

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值