Python 规范化LinkedIn用户联系人的职位名

CODE:

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2014-8-19
@author: guaguastd
@name: job_title_standard.py
'''

import csv
import os
import re
from collections import Counter
from operator import itemgetter

from prettytable import PrettyTable

# Location of the exported LinkedIn connections CSV, assembled from its
# individual path components.
_CSV_PATH_PARTS = (r"E:", "\\", "eclipse", "LinkedIn", "dfile", "my_connections.csv")
CSV_FILE = os.path.join(*_CSV_PATH_PARTS)

# Ordered (abbreviation, expansion) pairs used to normalize job titles: each
# occurrence of the first item is rewritten to the second item.
# Order matters because the pairs are applied one after another: the dotted
# forms ('Sr.', 'Jr.') come before the bare forms so the trailing period is
# consumed together with the abbreviation.
transforms = [
    ('Sr.', 'Senior'),
    ('Sr', 'Senior'),
    ('Jr.', 'Junior'),
    ('Jr', 'Junior'),
    ('CEO', 'Chief Executive Officer'),
    ('COO', 'Chief Operating Officer'),
    ('CTO', 'Chief Technology Officer'),
    # Use the standard spelling so "CFO" is counted together with titles
    # that already spell out "Chief Financial Officer".
    ('CFO', 'Chief Financial Officer'),
    ('VP', 'Vice President'),
]

# Load every contact row from the CSV export into memory. The file is opened
# in a `with` block so the handle is closed as soon as the rows have been
# read, instead of staying open for the rest of the script.
with open(CSV_FILE) as csvFile:
    csvReader = csv.DictReader(csvFile, delimiter=',', quotechar='"')
    contacts = list(csvReader)

# Build the list of individual job titles. Combined titles written with a
# slash, such as "President/CEO", are split into their parts. Other
# combinations ("President & CEO", "President and CEO") are NOT split and
# are kept as a single title.
titles = []
for contact in contacts:
    # A row with fewer fields than the header leaves the column as None;
    # treat that the same as a blank title.
    job_title = contact['Job Title'] or ''
    for part in job_title.split('/'):
        part = part.strip()
        # Skip blank fragments (an empty title, or a stray slash as in
        # "CEO/") so they are not counted as a title of their own.
        if part:
            titles.append(part)

# Replace common/known abbreviations with their expansions, applying the
# transforms in order. An abbreviation is only replaced when it stands on its
# own: a plain substring replace would also rewrite the "VP" inside "SVP" or
# the "CTO" inside "DIRECTOR".
def _abbreviation_pattern(abbreviation):
    """Compile a regex that matches `abbreviation` only as a whole word."""
    # Require a non-word neighbour only on the sides where the abbreviation
    # itself starts/ends with a word character, so 'Sr.' still matches when
    # it is followed directly by text.
    prefix = r'(?<!\w)' if re.match(r'\w', abbreviation) else ''
    suffix = r'(?!\w)' if re.search(r'\w\Z', abbreviation) else ''
    return re.compile(prefix + re.escape(abbreviation) + suffix)


_compiled_transforms = [(_abbreviation_pattern(abbreviation), expansion)
                        for abbreviation, expansion in transforms]

for i, title in enumerate(titles):
    for pattern, expansion in _compiled_transforms:
        # A callable replacement keeps the expansion literal (no backslash
        # or group-reference processing by re.sub).
        title = pattern.sub(lambda _match, text=expansion: text, title)
    titles[i] = title

# Print out a table of titles sorted by frequency, most frequent first.
pt = PrettyTable(field_names=['Title', 'Freq'])
pt.align = 'l'
c = Counter(titles)
# most_common() orders entries by descending count. Every counted title has
# a count of at least 1, so no extra frequency filter is needed.
for title, freq in c.most_common():
    pt.add_row([title, freq])
# print(...) with a single argument behaves the same on Python 2 and 3.
print(pt)

# Print out a table of tokens (the individual words of the titles) sorted by
# frequency, most frequent first.
tokens = []
for title in titles:
    # Split on whitespace and drop commas attached to either end of a word.
    tokens.extend(word.strip(',') for word in title.split())
pt = PrettyTable(field_names=['Token', 'Freq'])
pt.align = 'l'
c = Counter(tokens)
# most_common() orders entries by descending count; counts are always >= 1.
for token, freq in c.most_common():
    # Tokens of one or two characters are left out of the table.
    if len(token) > 2:
        pt.add_row([token, freq])
# print(...) with a single argument behaves the same on Python 2 and 3.
print(pt)

RESULT:

+-----------------------------------+------+
| Title                             | Freq |
+-----------------------------------+------+
| Senior Software Developer         | 1    |
| Sales Manager                     | 1    |
| Software Manager                  | 1    |
| Online Marketing Manager          | 1    |
| Senior Consultant                 | 1    |
| Chief Executive Officer & Founder | 1    |
| Director                          | 1    |
| S                                 | 1    |
| Student                           | 1    |
| Senior Software Engineer          | 1    |
| ???                               | 1    |
+-----------------------------------+------+
+------------+------+
| Token      | Freq |
+------------+------+
| Manager    | 3    |
| Senior     | 3    |
| Software   | 3    |
| Marketing  | 1    |
| Founder    | 1    |
| Consultant | 1    |
| Executive  | 1    |
| Sales      | 1    |
| Developer  | 1    |
| Director   | 1    |
| Chief      | 1    |
| Officer    | 1    |
| Student    | 1    |
| Online     | 1    |
| ???        | 1    |
| Engineer   | 1    |
+------------+------+


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值