利用特征词进行用户分类

一、项目概述:根据需求方提供的特征词,匹配 F 内容中含有特征词的用户。

二、项目代码

#coding:utf-8
import jieba
import re
from stop_words import stop_word
import pymysql as mydb
import pandas as pd
from selenium import webdriver
import time
import os
import warnings
warnings.filterwarnings('ignore')
# Connect to the database and load the rows to classify into `data_set`.
# NOTE: host, port, user, passwd and db are redacted placeholders in this
# listing (`port=XXXX` is not valid Python until a real port is filled in).
db = mydb.connect(host='11.111.111.11', port=XXXX, user='XX', passwd='XX', db='XX', charset='utf8')
# The CASE expression is aliased to `user`: without an alias the result column
# is named after the whole expression, and the later lookups of
# data_set['user'] would raise KeyError.
sql_cmd = (
    "select a.url, a.title, a.ori, a.ori_from, "
    "case a.user when '' then '无用户名' else a.user end as `user`, "
    "b.index, a.time, b.yes_no "
    "from content a, ba b where a.id = b.id and (b.age+0)< 5"
)
try:
    data_set = pd.read_sql(sql_cmd, db)
finally:
    # Release the connection even when the query fails.
    db.close()
# Tokenize every post (title + body) and flag rows containing a keyword.
titles = data_set['title'].values
contens = data_set['ori'].values
key_word = pd.read_csv('key_word.csv', header=None)
key_words = key_word[0].values
# Register the keywords with jieba so they are kept as single tokens.
jieba.load_userdict("key_word.csv")
# A set gives O(1) membership tests instead of scanning the array per token.
key_word_set = set(key_words)
# Digits, leftover Series-repr words and punctuation stripped before
# tokenizing; compiled once because it does not change between rows.
noise_pattern = re.compile(r"\d+|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\…\~\。\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]")
# Positional index of the flag column (`DataFrame.ix` no longer exists).
yes_no_col = data_set.columns.get_loc('yes_no')
for i, (title, content) in enumerate(zip(titles, contens)):
    print(i)
    # Skip NULL/NaN parts instead of failing on `None + str`.
    tz = ''.join(part for part in (title, content) if isinstance(part, str))
    string = noise_pattern.sub("", tz)
    # Drop all whitespace so jieba sees one contiguous string.
    tz_cut = ''.join(string.split())
    seg_list = jieba.lcut(tz_cut, cut_all=False)
    seg_list = [word for word in seg_list if word not in stop_word]
    print(seg_list)
    # One matching token is enough to flag the row.
    if any(w in key_word_set for w in seg_list):
        print(1)
        data_set.iloc[i, yes_no_col] = '1'
# Rows flagged '1' (content matched at least one keyword).
user_1 = data_set[data_set['yes_no'] == '1']
print(len(user_1))
# Every other row. Compared with != '1' rather than == '' so rows whose
# yes_no is NULL or any other non-'1' value are not silently dropped.
user_0 = data_set[data_set['yes_no'] != '1']
print(len(user_0))
# Count matched rows per user; `f_num` is that per-user count.
user_sort = user_1['user'].value_counts()
user_sorts = user_sort.to_frame('f_num')
# Attach each user's count to their rows, then order by count and user name,
# both descending.
result = user_1.join(user_sorts, on='user')
result = result.sort_values(by=['f_num', 'user'], ascending=False).reset_index(drop=True)
# Write both user tables to one Excel workbook.
# `now` is a run timestamp; it is reused below to name the screenshot folder.
now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime())
# Make sure the output directory exists before opening the workbook.
os.makedirs('data', exist_ok=True)
# The context manager saves and closes the workbook, also on error
# (`ExcelWriter.save()` was removed in pandas 2.0).
with pd.ExcelWriter('data/' + 'match_after.xlsx') as writer:
    result.to_excel(writer, sheet_name='user_1', index_label='序号')
    user_0.to_excel(writer, sheet_name='user_0', index_label='序号')
# Screenshot the page of every user with at least 5 matched rows.
# NOTE(review): the original column list was the malformed `[[ID','index']]`;
# 'user' is assumed because the first column is used as the file name and the
# second ('index') as the URL to open -- confirm against the real schema.
user_table = result[result['f_num'] >= 5][['user', 'index']].drop_duplicates().reset_index(drop=True)
# NOTE(review): PhantomJS support was removed in Selenium 4; this call needs
# an older Selenium or a switch to a headless browser driver.
brower = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1-windows\bin\phantomjs.exe")
try:
    os.makedirs('image/' + now, exist_ok=True)
    for user_a, url in user_table.itertuples(index=False, name=None):
        brower.get(url)
        brower.maximize_window()
        # Replace characters that are invalid in file names (or that would
        # add path segments) so every user maps to one file in the folder.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(user_a))
        # NOTE(review): Selenium writes PNG data regardless of the extension.
        picName = 'image/' + now + '/' + safe_name + ".jpg"
        brower.save_screenshot(picName)
finally:
    # Always shut the browser process down, even if a page fails to load.
    brower.quit()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值