# encoding: utf-8
import re
import pandas as pd
import os
from sqlalchemy import create_engine
# Module-level SQLAlchemy engine used by the to_sql() calls in this file.
# NOTE(review): the URL holds placeholder credentials, and the host
# "192.168.114.522" is not a valid IPv4 address (last octet > 255) --
# supply real connection settings before running.
engine = create_engine('oracle://username:password@192.168.114.522:1521/dbname')
def getProdname(url):
    """Return the product names found in the saved listing page at *url*.

    The file is decoded as GB18030 and scanned for the link text that
    follows each ``<div class="info-wrapper">`` element.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured product names in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'<div class="info-wrapper">.*?target="_blank">(.*?)</a><p')  # product info
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
def getStoreName(url):
    """Return the store names found in the saved listing page at *url*.

    The file is decoded as GB18030 and scanned for the link text inside
    each ``<td type="raw" class="raw">`` cell that directly follows two
    closing ``</div>`` tags and a closing ``</td>``.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured store names in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'</div></div></td><td type="raw" class="raw">.*?target="_blank">(.*?)</a></td><td')  # store name
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
def SmallCookie(li):
    """Print one ``rename`` command per product/store pair.

    For the n-th key of *li* (counting from 1) the printed command renames
    ``<n>.txt`` to ``<store>_<product>_<date>.txt``.  The date part is
    emitted literally as ``%DATE:~0,4%%DATE:~5,2%%DATE:~8,2%`` so that the
    shell running the command expands it (presumably Windows cmd.exe --
    TODO confirm the %DATE% layout on the target machine).

    Args:
        li: Mapping of product name -> store name (a dict, despite the
            parameter name).

    Returns:
        None. The commands are written to standard output.
    """
    # enumerate(start=1) covers every pair; the former range(1, len(...))
    # loop stopped one short and never printed the last rename.
    for number, product in enumerate(li, start=1):
        target = li[product] + "_" + product
        print("rename " + str(number) + ".txt " + target + "_%DATE:~0,4%%DATE:~5,2%%DATE:~8,2%.txt")
# Top10 traffic sources: text cells
def getTop1(url):
    """Return the text cells of the 'Top10流量来源' (traffic sources) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10流量来源' up to the last occurrence of
    'Top10引流关键词'; inside it the body of every
    ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10流量来源')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10引流关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 traffic-driving keywords: text cells
def getTop2(url):
    """Return the text cells of the 'Top10引流关键词' (traffic keywords) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10引流关键词' up to the last occurrence of
    'Top10成交关键词'; inside it the body of every
    ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10成交关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 converting keywords: text cells
def getTop3(url):
    """Return the text cells of the 'Top10成交关键词' (converting keywords) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10成交关键词' up to the last occurrence of
    '关联购买'; inside it the body of every ``<td type="raw" class="raw">``
    cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('关联购买')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 related-purchase ranking -- not used for now
def getTop4(url):
    """Return the text cells that follow the '关联购买' (related purchases) heading.

    The file at *url* is decoded as GB18030.  Everything from the last
    occurrence of '关联购买' to the end of the text is scanned and the body
    of every ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('关联购买')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:])
# ******************************* The functions above only extract the Chinese text cells *******************************
# def getTop(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
# res = re.findall(v_par,f.read())
# return res
def getStrNum(vstr,vfindstr):
return vstr.rfind(vfindstr)
# Numbers of the traffic-sources section
def getTopNum(url):
    """Return every other numeric cell of the 'Top10流量来源' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10流量来源' up to the last occurrence of
    'Top10引流关键词'; inside it the body of every
    ``<td type="num" class="num">`` cell is captured, and the matches at
    even positions (0, 2, 4, ...) are returned.
    NOTE(review): presumably each row yields two numeric matches and only
    the first is wanted -- confirm against a real page.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: The selected cell texts; empty when the section heading
        is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10流量来源')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10引流关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="num" class="num">(.*?)</td><td')
    res = v_par.findall(vstr[beg:end])
    return res[::2]
# Numbers of the Top10 traffic-driving keywords section
def getTopNum002(url):
    """Return the ``value`` attributes of the 'Top10引流关键词' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10引流关键词' up to the last occurrence of
    'Top10成交关键词'; inside it every ``</td><td value="..." width="``
    occurrence contributes its ``value`` text.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured values in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10成交关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])
# Numbers of the Top10 converting keywords section
def getTopNum003(url):
    """Return the ``value`` attributes of the 'Top10成交关键词' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10成交关键词' up to the last occurrence of
    '关联购买'; inside it every ``</td><td value="..." width="`` occurrence
    contributes its ``value`` text.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured values in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('关联购买')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])
def getTopProdInfo(url):
    """Return the product link texts found in the saved page at *url*.

    The file is decoded as GB18030 and scanned for the link text of each
    ``<div class="info-wrapper">`` that directly follows a closing
    ``</div>`` and is itself followed by ``<p class=``.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured link texts in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'</div><div class="info-wrapper">.*?target="_blank">(.*?)</a><p class=')
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
# # Numbers for the traffic-driving keywords
# def getTopNum2(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('</td><td value="(.*?)" width="')
# res = re.findall(v_par, f.read())
# return res
#
# # Numbers for the converting keywords
# def getTopNum3(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('</td><td value="(.*?)" width="')
# res = re.findall(v_par, f.read())
# return res
# Step 1: put the store name and the product name into the file names
def executeMain1(url):
    """Print rename commands built from the listing page at *url*.

    Product names and store names are extracted from the page, paired up
    positionally into a product -> store mapping, and handed to
    SmallCookie, which prints the commands.

    Args:
        url: Path of the saved listing page.
    """
    product_names = getProdname(url)
    store_names = getStoreName(url)
    store_by_product = {}
    # Later duplicates of a product name overwrite earlier ones, exactly as
    # dict(zip(...)) would.
    for product, store in zip(product_names, store_names):
        store_by_product[product] = store
    SmallCookie(store_by_product)
# Step 2: read one detail file and turn its contents into DataFrames
def executeMainDetail(url):
    """Load the three Top-10 sections of one detail file into 'taobaostore'.

    For each section (traffic sources, traffic-driving keywords, converting
    keywords) a frame with the columns ``vtop`` (text cells), ``vtopnum``
    (numbers), ``storename``, ``prodname``, ``crdt`` and ``type`` is built.
    The three frames are concatenated, appended to the database table
    'taobaostore' through the module-level ``engine``, and returned.

    Args:
        url: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Returns:
        pandas.DataFrame: The concatenation of the three section frames
        (each frame keeps its own 0-based index).

    Raises:
        IndexError: If the base name has fewer than three '_'-separated
            parts.
        ValueError: Raised by pandas when a section's text and number
            lists differ in length.
    """
    # Parse the base name only: splitting the whole path made the result
    # depend on the parent directory containing exactly one '_' and ending
    # in 'file/tmp2/detail/'.
    name_parts = os.path.basename(url).split('_')
    storename = name_parts[0]
    prodname = name_parts[1]
    crdt = name_parts[2].replace('.txt', '')

    sections = [
        # (type label, text cells, numbers)
        ('流量来源', getTop1(url), getTopNum(url)),        # traffic sources / visitor counts
        ('引流关键词', getTop2(url), getTopNum002(url)),   # Top10 traffic-driving keywords
        ('成交关键词', getTop3(url), getTopNum003(url)),   # Top10 converting keywords
    ]

    frames = []
    for kind, labels, numbers in sections:
        frame = pd.DataFrame({"vtop": labels, "vtopnum": numbers})
        frame['storename'] = storename
        frame['prodname'] = prodname
        frame['crdt'] = crdt
        frame['type'] = kind
        frames.append(frame)

    result = pd.concat(frames)
    # Persist all three sections; previously only the traffic-sources frame
    # was written although the combined frame was built and returned.
    result.to_sql('taobaostore', engine, if_exists='append', index=False)
    return result
def executeMain2(v_urlDetail):
    """Run executeMainDetail on every file found under *v_urlDetail*.

    The directory tree is walked with os.walk, so files in sub-directories
    are processed as well.

    Args:
        v_urlDetail: Directory holding the detail files; a trailing path
            separator is optional.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in; plain
            # string concatenation with v_urlDetail broke for nested files
            # and for a directory given without a trailing separator.
            executeMainDetail(os.path.join(root, filename))
def executeMain_pre(v_urlDetail):
    """Append the product names found in one detail file to 'buyproduct'.

    The names returned by getTopProdInfo go into column ``res``; the
    columns ``storename``, ``prodname`` and ``crdt`` are taken from the
    file name.  The frame is appended to the database table 'buyproduct'
    through the module-level ``engine``.

    Args:
        v_urlDetail: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Raises:
        IndexError: If the base name has fewer than three '_'-separated
            parts.
    """
    res = getTopProdInfo(v_urlDetail)
    df = pd.DataFrame({"res": res})
    # Parse the base name only: splitting the whole path made the result
    # depend on the parent directory containing exactly one '_' and ending
    # in 'file/tmp2/detail/'.
    name_parts = os.path.basename(v_urlDetail).split('_')
    df['storename'] = name_parts[0]
    df['prodname'] = name_parts[1]
    df['crdt'] = name_parts[2].replace('.txt', '')
    # index is documented as a bool; False states the intent explicitly.
    df.to_sql('buyproduct', engine, if_exists='append', index=False)
def executeMain3(v_urlDetail):
    """Run executeMain_pre on every file found under *v_urlDetail*.

    The directory tree is walked with os.walk, so files in sub-directories
    are processed as well.

    Args:
        v_urlDetail: Directory holding the detail files; a trailing path
            separator is optional.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in; plain
            # string concatenation with v_urlDetail broke for nested files
            # and for a directory given without a trailing separator.
            executeMain_pre(os.path.join(root, filename))
# Script entry point.
# NOTE(review): an identical copy of this module's code, including a second
# `__main__` guard, follows this block, so running the file as a script
# calls executeMain3 twice.
if __name__ == '__main__':
    # Listing page for step 1 (executeMain1); unused while that call stays
    # commented out.
    url='C:/Users/lusheng/Desktop/download_file/tmp2/main.txt'
    # Directory holding the detail files (named <store>_<product>_<date>.txt).
    urlDetail = 'C:/Users/lusheng/Desktop/download_file/tmp2/detail/'
    # executeMain1(url)
    # executeMain2(urlDetail)
    executeMain3(urlDetail)
    # Manual test
    # res = executeMainDetail(urlDetail+"君来康医疗器材商城_日本中山式背背佳成人驼背矫正带女透气儿童纠正带学生脊椎矫正衣_20180109.txt")
    # print(res)
import re
import pandas as pd
import os
from sqlalchemy import create_engine
# Module-level SQLAlchemy engine used by the to_sql() calls in this file.
# NOTE(review): this rebinds `engine`, which an identical statement earlier
# in the file already created. The URL holds placeholder credentials, and
# the host "192.168.114.522" is not a valid IPv4 address (last octet > 255)
# -- supply real connection settings before running.
engine = create_engine('oracle://username:password@192.168.114.522:1521/dbname')
def getProdname(url):
    """Return the product names found in the saved listing page at *url*.

    The file is decoded as GB18030 and scanned for the link text that
    follows each ``<div class="info-wrapper">`` element.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured product names in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'<div class="info-wrapper">.*?target="_blank">(.*?)</a><p')  # product info
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
def getStoreName(url):
    """Return the store names found in the saved listing page at *url*.

    The file is decoded as GB18030 and scanned for the link text inside
    each ``<td type="raw" class="raw">`` cell that directly follows two
    closing ``</div>`` tags and a closing ``</td>``.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured store names in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'</div></div></td><td type="raw" class="raw">.*?target="_blank">(.*?)</a></td><td')  # store name
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
def SmallCookie(li):
    """Print one ``rename`` command per product/store pair.

    For the n-th key of *li* (counting from 1) the printed command renames
    ``<n>.txt`` to ``<store>_<product>_<date>.txt``.  The date part is
    emitted literally as ``%DATE:~0,4%%DATE:~5,2%%DATE:~8,2%`` so that the
    shell running the command expands it (presumably Windows cmd.exe --
    TODO confirm the %DATE% layout on the target machine).

    Args:
        li: Mapping of product name -> store name (a dict, despite the
            parameter name).

    Returns:
        None. The commands are written to standard output.
    """
    # enumerate(start=1) covers every pair; the former range(1, len(...))
    # loop stopped one short and never printed the last rename.
    for number, product in enumerate(li, start=1):
        target = li[product] + "_" + product
        print("rename " + str(number) + ".txt " + target + "_%DATE:~0,4%%DATE:~5,2%%DATE:~8,2%.txt")
# Top10 traffic sources: text cells
def getTop1(url):
    """Return the text cells of the 'Top10流量来源' (traffic sources) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10流量来源' up to the last occurrence of
    'Top10引流关键词'; inside it the body of every
    ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10流量来源')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10引流关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 traffic-driving keywords: text cells
def getTop2(url):
    """Return the text cells of the 'Top10引流关键词' (traffic keywords) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10引流关键词' up to the last occurrence of
    'Top10成交关键词'; inside it the body of every
    ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10成交关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 converting keywords: text cells
def getTop3(url):
    """Return the text cells of the 'Top10成交关键词' (converting keywords) section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10成交关键词' up to the last occurrence of
    '关联购买'; inside it the body of every ``<td type="raw" class="raw">``
    cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('关联购买')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])
# Top10 related-purchase ranking -- not used for now
def getTop4(url):
    """Return the text cells that follow the '关联购买' (related purchases) heading.

    The file at *url* is decoded as GB18030.  Everything from the last
    occurrence of '关联购买' to the end of the text is scanned and the body
    of every ``<td type="raw" class="raw">`` cell is captured.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured cell texts in document order; empty when the
        heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('关联购买')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    v_par = re.compile(r'<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:])
# ******************************* The functions above only extract the Chinese text cells *******************************
# def getTop(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
# res = re.findall(v_par,f.read())
# return res
def getStrNum(vstr,vfindstr):
return vstr.rfind(vfindstr)
# Numbers of the traffic-sources section
def getTopNum(url):
    """Return every other numeric cell of the 'Top10流量来源' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10流量来源' up to the last occurrence of
    'Top10引流关键词'; inside it the body of every
    ``<td type="num" class="num">`` cell is captured, and the matches at
    even positions (0, 2, 4, ...) are returned.
    NOTE(review): presumably each row yields two numeric matches and only
    the first is wanted -- confirm against a real page.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: The selected cell texts; empty when the section heading
        is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10流量来源')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10引流关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'<td type="num" class="num">(.*?)</td><td')
    res = v_par.findall(vstr[beg:end])
    return res[::2]
# Numbers of the Top10 traffic-driving keywords section
def getTopNum002(url):
    """Return the ``value`` attributes of the 'Top10引流关键词' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10引流关键词' up to the last occurrence of
    'Top10成交关键词'; inside it every ``</td><td value="..." width="``
    occurrence contributes its ``value`` text.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured values in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('Top10成交关键词')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])
# Numbers of the Top10 converting keywords section
def getTopNum003(url):
    """Return the ``value`` attributes of the 'Top10成交关键词' section.

    The file at *url* is decoded as GB18030.  The section is the text from
    the last occurrence of 'Top10成交关键词' up to the last occurrence of
    '关联购买'; inside it every ``</td><td value="..." width="`` occurrence
    contributes its ``value`` text.

    Args:
        url: Path of a saved detail page (a local file, despite the name).

    Returns:
        list[str]: Captured values in document order; empty when the
        section heading is missing or nothing matches.
    """
    # Close the file deterministically; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    if beg == -1:
        # No such section: a -1 start index would slice only the final character.
        return []
    end = vstr.rfind('关联购买')
    if end == -1:
        # Closing heading missing: read to the end of the text instead of
        # using -1 as a slice bound (which drops the last character).
        end = len(vstr)
    v_par = re.compile(r'</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])
def getTopProdInfo(url):
    """Return the product link texts found in the saved page at *url*.

    The file is decoded as GB18030 and scanned for the link text of each
    ``<div class="info-wrapper">`` that directly follows a closing
    ``</div>`` and is itself followed by ``<p class=``.

    Args:
        url: Path of the saved page (a local file path, despite the name).

    Returns:
        list[str]: Captured link texts in document order; empty when
        nothing matches.
    """
    v_par = re.compile(r'</div><div class="info-wrapper">.*?target="_blank">(.*?)</a><p class=')
    # The context manager closes the file; the handle used to be leaked.
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
# # Numbers for the traffic-driving keywords
# def getTopNum2(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('</td><td value="(.*?)" width="')
# res = re.findall(v_par, f.read())
# return res
#
# # Numbers for the converting keywords
# def getTopNum3(url):
# f = open(url, 'r', encoding='gb18030')
# v_par = re.compile('</td><td value="(.*?)" width="')
# res = re.findall(v_par, f.read())
# return res
# Step 1: put the store name and the product name into the file names
def executeMain1(url):
    """Print rename commands built from the listing page at *url*.

    Product names and store names are extracted from the page, paired up
    positionally into a product -> store mapping, and handed to
    SmallCookie, which prints the commands.

    Args:
        url: Path of the saved listing page.
    """
    product_names = getProdname(url)
    store_names = getStoreName(url)
    store_by_product = {}
    # Later duplicates of a product name overwrite earlier ones, exactly as
    # dict(zip(...)) would.
    for product, store in zip(product_names, store_names):
        store_by_product[product] = store
    SmallCookie(store_by_product)
# Step 2: read one detail file and turn its contents into DataFrames
def executeMainDetail(url):
    """Load the three Top-10 sections of one detail file into 'taobaostore'.

    For each section (traffic sources, traffic-driving keywords, converting
    keywords) a frame with the columns ``vtop`` (text cells), ``vtopnum``
    (numbers), ``storename``, ``prodname``, ``crdt`` and ``type`` is built.
    The three frames are concatenated, appended to the database table
    'taobaostore' through the module-level ``engine``, and returned.

    Args:
        url: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Returns:
        pandas.DataFrame: The concatenation of the three section frames
        (each frame keeps its own 0-based index).

    Raises:
        IndexError: If the base name has fewer than three '_'-separated
            parts.
        ValueError: Raised by pandas when a section's text and number
            lists differ in length.
    """
    # Parse the base name only: splitting the whole path made the result
    # depend on the parent directory containing exactly one '_' and ending
    # in 'file/tmp2/detail/'.
    name_parts = os.path.basename(url).split('_')
    storename = name_parts[0]
    prodname = name_parts[1]
    crdt = name_parts[2].replace('.txt', '')

    sections = [
        # (type label, text cells, numbers)
        ('流量来源', getTop1(url), getTopNum(url)),        # traffic sources / visitor counts
        ('引流关键词', getTop2(url), getTopNum002(url)),   # Top10 traffic-driving keywords
        ('成交关键词', getTop3(url), getTopNum003(url)),   # Top10 converting keywords
    ]

    frames = []
    for kind, labels, numbers in sections:
        frame = pd.DataFrame({"vtop": labels, "vtopnum": numbers})
        frame['storename'] = storename
        frame['prodname'] = prodname
        frame['crdt'] = crdt
        frame['type'] = kind
        frames.append(frame)

    result = pd.concat(frames)
    # Persist all three sections; previously only the traffic-sources frame
    # was written although the combined frame was built and returned.
    result.to_sql('taobaostore', engine, if_exists='append', index=False)
    return result
def executeMain2(v_urlDetail):
    """Run executeMainDetail on every file found under *v_urlDetail*.

    The directory tree is walked with os.walk, so files in sub-directories
    are processed as well.

    Args:
        v_urlDetail: Directory holding the detail files; a trailing path
            separator is optional.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in; plain
            # string concatenation with v_urlDetail broke for nested files
            # and for a directory given without a trailing separator.
            executeMainDetail(os.path.join(root, filename))
def executeMain_pre(v_urlDetail):
    """Append the product names found in one detail file to 'buyproduct'.

    The names returned by getTopProdInfo go into column ``res``; the
    columns ``storename``, ``prodname`` and ``crdt`` are taken from the
    file name.  The frame is appended to the database table 'buyproduct'
    through the module-level ``engine``.

    Args:
        v_urlDetail: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Raises:
        IndexError: If the base name has fewer than three '_'-separated
            parts.
    """
    res = getTopProdInfo(v_urlDetail)
    df = pd.DataFrame({"res": res})
    # Parse the base name only: splitting the whole path made the result
    # depend on the parent directory containing exactly one '_' and ending
    # in 'file/tmp2/detail/'.
    name_parts = os.path.basename(v_urlDetail).split('_')
    df['storename'] = name_parts[0]
    df['prodname'] = name_parts[1]
    df['crdt'] = name_parts[2].replace('.txt', '')
    # index is documented as a bool; False states the intent explicitly.
    df.to_sql('buyproduct', engine, if_exists='append', index=False)
def executeMain3(v_urlDetail):
    """Run executeMain_pre on every file found under *v_urlDetail*.

    The directory tree is walked with os.walk, so files in sub-directories
    are processed as well.

    Args:
        v_urlDetail: Directory holding the detail files; a trailing path
            separator is optional.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in; plain
            # string concatenation with v_urlDetail broke for nested files
            # and for a directory given without a trailing separator.
            executeMain_pre(os.path.join(root, filename))
# Script entry point.
# NOTE(review): an identical copy of this module's code, including another
# `__main__` guard, precedes this block, so running the file as a script
# calls executeMain3 twice.
if __name__ == '__main__':
    # Listing page for step 1 (executeMain1); unused while that call stays
    # commented out.
    url='C:/Users/lusheng/Desktop/download_file/tmp2/main.txt'
    # Directory holding the detail files (named <store>_<product>_<date>.txt).
    urlDetail = 'C:/Users/lusheng/Desktop/download_file/tmp2/detail/'
    # executeMain1(url)
    # executeMain2(urlDetail)
    executeMain3(urlDetail)
    # Manual test
    # res = executeMainDetail(urlDetail+"君来康医疗器材商城_日本中山式背背佳成人驼背矫正带女透气儿童纠正带学生脊椎矫正衣_20180109.txt")
    # print(res)