用正则表达式

# encoding: utf-8
import re
import pandas as pd
import os
from sqlalchemy import create_engine
# Module-level SQLAlchemy engine; it is the connection passed to the
# DataFrame.to_sql() calls further down in this file.
# NOTE(review): the URL carries placeholder-looking credentials, and
# 192.168.114.522 is not a valid IPv4 address (last octet > 255) -- replace
# with the real connection settings before running.
engine = create_engine('oracle://username:password@192.168.114.522:1521/dbname')
def getProdname(url):
    """Extract product names from a saved page.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: The anchor text captured after each
        ``<div class="info-wrapper">`` occurrence, in document order.
        Empty when nothing matches.
    """
    v_par = re.compile(r'<div class="info-wrapper">.*?target="_blank">(.*?)</a><p')  # product info
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())


def getStoreName(url):
    """Extract store names from a saved page.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: The anchor text captured from each "raw" table cell that
        directly follows a ``</div></div></td>`` sequence, in document
        order. Empty when nothing matches.
    """
    v_par = re.compile(r'</div></div></td><td type="raw" class="raw">.*?target="_blank">(.*?)</a></td><td')  # store name
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())


def SmallCookie(li):
    """Print one Windows ``rename`` command per entry of *li*.

    Entry number ``k`` (1-based, in the mapping's iteration order) produces a
    command that renames ``k.txt`` to ``<value>_<key>_<date>.txt``, where the
    date part is left as cmd.exe ``%DATE%`` substring expansions.

    Args:
        li: Mapping of str to str. ``executeMain1`` passes
            product name -> store name.

    Returns:
        None. The commands are written to stdout.
    """
    targets = [value + "_" + key for key, value in li.items()]
    # Number from 1 through len(targets) inclusive; the previous
    # range(1, len(...)) stopped one short and never emitted the last entry.
    for number, target in enumerate(targets, start=1):
        print("rename " + str(number) + ".txt " + target + "_%DATE:~0,4%%DATE:~5,2%%DATE:~8,2%.txt")
# Top10流量来源 中文字
def getTop1(url):
    """Return the text cells of the 'Top10流量来源' (traffic source) section.

    The section is the slice of the file between the last occurrence of
    'Top10流量来源' and the last occurrence of 'Top10引流关键词'. A missing
    marker yields -1 from ``str.rfind`` and the slice is taken with that
    value as-is.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: Contents of every ``<td type="raw" class="raw">`` cell in
        the section, in document order.
    """
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10流量来源')
    end = vstr.rfind('Top10引流关键词')
    v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])


# Top10引流关键词 中文字
def getTop2(url):
    """Return the text cells of the 'Top10引流关键词' (traffic keyword) section.

    The section is the slice of the file between the last occurrence of
    'Top10引流关键词' and the last occurrence of 'Top10成交关键词'. A missing
    marker yields -1 from ``str.rfind`` and the slice is taken with that
    value as-is.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: Contents of every ``<td type="raw" class="raw">`` cell in
        the section, in document order.
    """
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    end = vstr.rfind('Top10成交关键词')
    v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])


# Top10成交关键词 中文字
def getTop3(url):
    """Return the text cells of the 'Top10成交关键词' (converting keyword) section.

    The section is the slice of the file between the last occurrence of
    'Top10成交关键词' and the last occurrence of '关联购买'. A missing marker
    yields -1 from ``str.rfind`` and the slice is taken with that value
    as-is.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: Contents of every ``<td type="raw" class="raw">`` cell in
        the section, in document order.
    """
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    end = vstr.rfind('关联购买')
    v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
    return v_par.findall(vstr[beg:end])


# Top10关联购买的排名 暂时不用
def getTop4(url):
    f = open(url, 'r', encoding='gb18030')
    vstr = f.read()
    beg = (getStrNum(vstr, '关联购买'))
    vtext = (vstr[beg:])
    v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
    res = re.findall(v_par,vtext)
    return res
# *******************************上面只用于截取中文字符**********************************************************************************************
# def getTop(url):
#     f = open(url, 'r', encoding='gb18030')
#     v_par = re.compile('<td type="raw" class="raw">(.*?)</td><td')
#     res = re.findall(v_par,f.read())
#     return res


def getStrNum(vstr,vfindstr):
    return vstr.rfind(vfindstr)
# 取流量来源数字
def getTopNum(url):
    f = open(url, 'r', encoding='gb18030')
    vstr = f.read()
    beg =(getStrNum(vstr,'Top10流量来源'))
    end =(getStrNum(vstr,'Top10引流关键词'))
    vtext =(vstr[beg:end])
    v_par = re.compile('<td type="num" class="num">(.*?)</td><td')
    res = re.findall(v_par,vtext)
    return (res[::2])
#取Top10引流关键词的数字
def getTopNum002(url):
    """Return the ``value`` attributes of the 'Top10引流关键词' section.

    The section is the slice of the file between the last occurrence of
    'Top10引流关键词' and the last occurrence of 'Top10成交关键词'. A missing
    marker yields -1 from ``str.rfind`` and the slice is taken with that
    value as-is.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: The ``value="..."`` attribute of every
        ``</td><td value="..." width="`` occurrence in the section.
    """
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10引流关键词')
    end = vstr.rfind('Top10成交关键词')
    v_par = re.compile('</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])


#取Top10成交关键词 的数字
def getTopNum003(url):
    """Return the ``value`` attributes of the 'Top10成交关键词' section.

    The section is the slice of the file between the last occurrence of
    'Top10成交关键词' and the last occurrence of '关联购买'. A missing marker
    yields -1 from ``str.rfind`` and the slice is taken with that value
    as-is.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: The ``value="..."`` attribute of every
        ``</td><td value="..." width="`` occurrence in the section.
    """
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        vstr = f.read()
    beg = vstr.rfind('Top10成交关键词')
    end = vstr.rfind('关联购买')
    v_par = re.compile('</td><td value="(.*?)" width="')
    return v_par.findall(vstr[beg:end])




def getTopProdInfo(url):
    """Extract product titles from the ``info-wrapper`` blocks of a saved page.

    Args:
        url: Path of a local text file; it is decoded as gb18030.

    Returns:
        list[str]: The anchor text captured after each
        ``</div><div class="info-wrapper">`` occurrence that is followed by
        ``</a><p class=``, in document order. Empty when nothing matches.
    """
    v_par = re.compile('</div><div class="info-wrapper">.*?target="_blank">(.*?)</a><p class=')
    # Context manager guarantees the handle is closed (it used to be leaked).
    with open(url, 'r', encoding='gb18030') as f:
        return v_par.findall(f.read())
# # 引流关键词数字
# def getTopNum2(url):
#     f = open(url, 'r', encoding='gb18030')
#     v_par = re.compile('</td><td value="(.*?)" width="')
#     res = re.findall(v_par, f.read())
#     return res
#
# # 成交关键词数字
# def getTopNum3(url):
#     f = open(url, 'r', encoding='gb18030')
#     v_par = re.compile('</td><td value="(.*?)" width="')
#     res = re.findall(v_par, f.read())
#     return res
#第一步先将店名和产品名称放在文件名上
def executeMain1(url):
    """Step 1: print rename commands that put store and product names into file names.

    Reads product names and store names from the page at *url*, pairs them
    positionally into a product-name -> store-name mapping, and passes that
    mapping to ``SmallCookie``.
    """
    product_names = getProdname(url)
    store_names = getStoreName(url)
    store_by_product = {
        product: store for product, store in zip(product_names, store_names)
    }
    SmallCookie(store_by_product)
# 第二步,读取文件夹下的所有文件,将文件和内容做成Dataform
def executeMainDetail(url):
    """Parse one detail page and append its three Top10 tables to ``taobaostore``.

    For each of the traffic-source, traffic-keyword and converting-keyword
    sections, a frame with columns ``vtop`` (label) and ``vtopnum`` (number)
    is built and tagged with ``storename``, ``prodname``, ``crdt`` and
    ``type``. The three frames are concatenated, written to the database and
    returned.

    Args:
        url: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Returns:
        pandas.DataFrame: The concatenated rows of all three sections.

    Raises:
        ValueError: If the base name has fewer than three ``_``-separated
            parts, or (from pandas) if a section's label and number lists
            differ in length.
    """
    # Parse the base name only, so the result no longer depends on the
    # directory containing exactly one underscore and the literal
    # 'file/tmp2/detail/' fragment. Underscores inside the product name are
    # kept as part of the product name.
    stem = os.path.splitext(os.path.basename(url))[0]
    parts = stem.split('_')
    if len(parts) < 3:
        raise ValueError(
            "expected a file name like <store>_<product>_<date>.txt, got %r" % url
        )
    storename = parts[0]
    prodname = '_'.join(parts[1:-1])
    crdt = parts[-1]

    # (type label, text cells, numeric cells) for each section.
    sections = [
        ('流量来源', getTop1(url), getTopNum(url)),
        ('引流关键词', getTop2(url), getTopNum002(url)),
        ('成交关键词', getTop3(url), getTopNum003(url)),
    ]

    frames = []
    for section_type, labels, numbers in sections:
        frame = pd.DataFrame({"vtop": labels, "vtopnum": numbers})
        frame['storename'] = storename
        frame['prodname'] = prodname
        frame['crdt'] = crdt
        frame['type'] = section_type
        frames.append(frame)

    result = pd.concat(frames)
    # Persist ALL sections; previously only the first frame was written even
    # though the concatenated result was what the function returned.
    result.to_sql('taobaostore', engine, if_exists='append', index=False)
    return result


def executeMain2(v_urlDetail):
    """Run ``executeMainDetail`` on every file found under *v_urlDetail*.

    Args:
        v_urlDetail: Directory to scan; ``os.walk`` also descends into its
            subdirectories.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in: plain
            # ``v_urlDetail + file`` pointed at the wrong path for files in
            # subdirectories and when the argument had no trailing separator.
            executeMainDetail(os.path.join(root, filename))






def executeMain_pre(v_urlDetail):
    """Append the product titles found in one detail file to ``buyproduct``.

    Builds a frame with a ``res`` column (one row per title returned by
    ``getTopProdInfo``) plus ``storename``, ``prodname`` and ``crdt`` taken
    from the file name, and appends it to the database table.

    Args:
        v_urlDetail: Path of a detail file whose base name has the form
            ``<store>_<product>_<date>.txt``.

    Raises:
        ValueError: If the base name has fewer than three ``_``-separated
            parts.
    """
    res = getTopProdInfo(v_urlDetail)

    # Parse the base name only, so the result no longer depends on the
    # directory containing exactly one underscore and the literal
    # 'file/tmp2/detail/' fragment. Underscores inside the product name are
    # kept as part of the product name.
    stem = os.path.splitext(os.path.basename(v_urlDetail))[0]
    parts = stem.split('_')
    if len(parts) < 3:
        raise ValueError(
            "expected a file name like <store>_<product>_<date>.txt, got %r"
            % v_urlDetail
        )

    df = pd.DataFrame({"res": res})
    df['storename'] = parts[0]
    df['prodname'] = '_'.join(parts[1:-1])
    df['crdt'] = parts[-1]
    # index=False is the documented way to skip the index column.
    df.to_sql('buyproduct', engine, if_exists='append', index=False)




def executeMain3(v_urlDetail):
    """Run ``executeMain_pre`` on every file found under *v_urlDetail*.

    Args:
        v_urlDetail: Directory to scan; ``os.walk`` also descends into its
            subdirectories.
    """
    for root, _dirs, files in os.walk(v_urlDetail):
        for filename in files:
            # Join with the directory os.walk is currently in: plain
            # ``v_urlDetail + file`` pointed at the wrong path for files in
            # subdirectories and when the argument had no trailing separator.
            executeMain_pre(os.path.join(root, filename))


if __name__ == '__main__':
    # Script entry point. Only step 3 is enabled; steps 1 and 2 are kept
    # commented out so they can be switched on by hand.
    # Saved main page: the argument of the (disabled) executeMain1 call.
    url='C:/Users/lusheng/Desktop/download_file/tmp2/main.txt'
    # Directory of detail files walked by executeMain2 / executeMain3.
    # The trailing '/' matters: those functions build paths as directory + file name.
    urlDetail = 'C:/Users/lusheng/Desktop/download_file/tmp2/detail/'
    # executeMain1(url)
    # executeMain2(urlDetail)
    executeMain3(urlDetail)
    # Test: run a single detail file and print the resulting frame.
    # res = executeMainDetail(urlDetail+"君来康医疗器材商城_日本中山式背背佳成人驼背矫正带女透气儿童纠正带学生脊椎矫正衣_20180109.txt")
    # print(res)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值