NLP Python

最新推荐文章于 2024-07-12 16:16:27 发布

并不文艺的小朋友

最新推荐文章于 2024-07-12 16:16:27 发布

阅读量577

点赞数

文章标签： python 自然语言处理机器学习

本文链接：https://blog.csdn.net/Baby_Hippopo/article/details/125067288

版权

NLP Tutorials

# encoding: utf-8
# 输入的信息是已经解析为json格式的简历信息集 这个可以用pdfString文件执行这一部分的逻辑

# Assumptions: 在解析模块能够准确获得用户的姓名 对应岗位 和 简历正文

import pandas as pd
import numpy  as np
import jieba
import json
import pickle

from   collections import Counter

import os
import requests
import sys
import re

# self written
import infoextract
import pdfString
from data import Reference
import Try02

# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from transformers import BertTokenizer, BertModel
import torch
import logging


# This file Asks Json in form [{},{},...,{}]

# Paths, column names and remote resources used by the training pipeline.
# Dataset that train_Model() loads through inputSource().
FILEPATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data\\Result.csv"
# Location prefix inputSource() uses when it saves a CSV copy of a JSON source.
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data"
# Columns that function.extractDS() adds to the dataset (one per extracted field).
INFOLIST = ["name", "infotext", "jobs"]
# Remote stop-word list downloaded by function.getSiteStopword().
STOPWORDSITE = "https://raw.githubusercontent.com/goto456/stopwords/master/cn_stopwords.txt"


# Machine-learning objects shared at module level: they are fitted in
# textVary.tf_idf_regression() and reused by recommend_Resume().
tfidf   = TfidfTransformer()
counter = CountVectorizer(analyzer = 'word')

# 输入简历集
def inputSource (sourcePath):
        """Load a resume dataset from a ``.json`` or ``.csv`` file.

        A JSON source is also saved as ``<CSV_PATH>/<stem>.csv`` (UTF-8 with
        BOM) as a side effect.

        Args:
                sourcePath: Path of the dataset file. Both ``\\`` and ``/`` are
                        accepted as directory separators.

        Returns:
                The dataset as a pandas DataFrame.

        Raises:
                ValueError: If the file extension is neither ``.json`` nor ``.csv``.
        """
        # Isolate the file name regardless of which separator style the caller used.
        filename = sourcePath.replace("\\", "/").rsplit("/", 1)[-1]
        stem, extension = os.path.splitext(filename)
        extension = extension.lower()

        if extension == ".csv":
                return pd.read_csv(sourcePath, encoding = "utf-8")

        if extension == ".json":
                targetDS = pd.read_json(sourcePath, encoding = "utf-8")
                # Join with a separator; plain concatenation wrote "...\dataResult.csv".
                targetDS.to_csv(os.path.join(CSV_PATH, stem + ".csv"), encoding = "utf_8_sig")
                return targetDS

        # Previously this fell through to an unbound local variable.
        raise ValueError("Unsupported dataset format (expected .json or .csv): " + str(sourcePath))

class preExtractor (object):
        """Pre-parse one resume into the fields used for training.

        After construction, ``self.info`` holds ``name`` and ``jobs`` (as
        returned by ``infoextract.Extractor(...).search()``) and ``infotext``
        (the token list produced by :meth:`textCut`).
        """

        def __init__ (self, sourceText, filename):
                """Extract the fields from ``sourceText``.

                Args:
                        sourceText: Full text of the resume.
                        filename: Path or name of the resume file, forwarded to the extractor.
                """
                self.fullText = sourceText
                self.file_dir = filename

                # Extract information
                ansinfo = infoextract.Extractor(file_dir = self.file_dir, file_text = self.fullText, switch = 1).search()
                self.info = {
                        "name":         ansinfo["user_name"],
                        "infotext":     self.textCut (content = self.fullText),
                        "jobs":         ansinfo["jobs"]
                }

        def textCut (self, content):
                """Clean ``content`` and tokenise it with jieba.

                Returns:
                        The tokens with stop words and purely numeric tokens removed.
                """
                cleaned = self.textwasher(text = content, quit_universal = True)
                seg = list(jieba.cut(str(cleaned).strip()))
                # Remove stop words
                seg = function.removeStopword(seg)
                # Remove purely numeric tokens
                return [token for token in seg if not str(token).isdigit()]

        def textwasher (self, text, quit_universal):
                """Strip line breaks, whitespace and punctuation from resume text.

                Args:
                        text: Raw resume text.
                        quit_universal: When true, the first 7 and the last 5 lines are
                                dropped first (per the original note these carry extra
                                information items added for AIA). When false the text is
                                cleaned as-is.

                Returns:
                        The cleaned text. It contains no whitespace; only CJK characters,
                        ASCII letters, digits and colons not followed by whitespace survive.
                """
                if quit_universal:
                        lines = text.split('\n')
                        # Slicing tolerates texts shorter than the header/footer size.
                        text0 = " ".join(lines[7:-5])
                else:
                        # Previously text0 was left undefined on this path.
                        text0 = text

                # A run of punctuation followed by whitespace.
                pattern1 = '[’!"#$%&\'()*+,-./:：:;<=>?@[\\]^_`{|}~]+'
                pattern2 = '\\s+'
                # Any whitespace character, plus literal '|', '*' and '$'.
                pattern3 = r'[\n|\u3000|\s*$]'
                # Anything that is not whitespace, a digit, a colon, CJK or an ASCII letter.
                pattern4 = re.compile(r'[^\s1234567890:：\u4e00-\u9fa5a-zA-Z]+')

                text1 = re.sub(pattern1 + pattern2, '', text0)
                text2 = re.sub(pattern3, '', text1)
                text3 = re.sub(pattern4, '', text2)

                return text3

class function ():
        """Namespace of stateless helpers, called as ``function.<name>(...)``."""

        @staticmethod
        def extractDS (targetDS):
                """Parse every row of ``targetDS`` in place.

                Adds one column per entry of ``INFOLIST`` and fills it with the string
                form of the matching ``preExtractor(...).info`` value. Reads the
                ``Text`` and ``File_name`` columns of each row.
                """
                for key in INFOLIST:
                        targetDS[key] = ""

                for index, row in targetDS.iterrows():
                        ansinfo = preExtractor(sourceText = row["Text"], filename = row["File_name"]).info
                        for key in ansinfo:
                                targetDS.loc[index, key] = str(ansinfo[key])

        @staticmethod
        def getSiteStopword ():
                """Add the stop words from ``STOPWORDSITE`` to ``Reference.STOPWORDLIST``.

                The list is downloaded once and cached in ``data/stopWord.json``; later
                calls read the cache. Words already present are not added again.

                Raises:
                        requests.HTTPError: If the download returns an error status.
                """
                cache_path = 'data/stopWord.json'
                if not os.path.exists(cache_path):
                        response = requests.get(STOPWORDSITE, timeout = 10)
                        # Do not cache an error page as if it were the word list.
                        response.raise_for_status()
                        with open(cache_path, "wb") as f:
                                f.write(response.content)

                with open(cache_path, "r", encoding = "utf-8") as f:
                        words = [word for word in f.read().split("\n") if word]

                # The original wrote to ``stopWord.STOPLIST`` on the HTTP response object,
                # which always raised AttributeError.
                # NOTE(review): assumes Reference.STOPWORDLIST is a list - confirm in data.py.
                known = set(Reference.STOPWORDLIST)
                Reference.STOPWORDLIST += [word for word in words if word not in known]

        @staticmethod
        def removeStopword (wordList):
                """Return ``wordList`` without the words in ``Reference.STOPWORDLIST``."""
                stopwords = set(Reference.STOPWORDLIST)
                return [word for word in wordList if word not in stopwords]

        @staticmethod
        def classifyJobs (position):
                """Map a job title to its job type from ``Reference.JOBS_TYPE_DICT``.

                If removing every character other than ASCII letters, digits, colons
                and whitespace leaves more than two characters, that reduced title is
                matched; otherwise the full title is. The first dictionary key
                contained in the title decides the type.

                Returns:
                        The matching dictionary value, or 4 when no key matches or the
                        matched value is a string (the original note says unclear
                        titles are treated as BA for now).
                """
                latin_part = re.sub(r'[^\s1234567890:：a-zA-Z]+', "", position)
                title = latin_part if len(latin_part) > 2 else position

                for key, job_type in Reference.JOBS_TYPE_DICT.items():
                        if key in title:
                                return 4 if isinstance(job_type, str) else job_type
                return 4

class   textVary (object):
        """Text-classification experiments, called as ``textVary.<name>(...)``."""

        @staticmethod
        def tf_idf_regression (trainL, testL, y_trainL, y_testL):
                """Train a TF-IDF + logistic-regression classifier and report test scores.

                The module-level ``counter`` and ``tfidf`` objects are fitted on the
                training documents here, so they can vectorise new text afterwards.

                Args:
                        trainL: Training documents, each an iterable of tokens.
                        testL: Test documents, each an iterable of tokens.
                        y_trainL: Labels for ``trainL``.
                        y_testL: Labels for ``testL``.

                Returns:
                        The logistic-regression model selected by the grid search,
                        refitted on the whole training set.
                """
                # The vectorisers expect whitespace-separated strings.
                info_train1 = [' '.join(i) for i in trainL]
                info_test1  = [' '.join(i) for i in testL]

                tfidf_train = tfidf.fit_transform(counter.fit_transform(info_train1))
                # Only transform the test set: re-fitting here would replace the IDF
                # weights learned from the training data.
                tfidf_test  = tfidf.transform(counter.transform(info_test1))

                print(tfidf_train.shape, tfidf_test.shape)

                param_grid = {
                        'C': [0.01, 0.1, 1.0, 2.0, 10, 100],
                        'penalty' : ['l2']
                }
                grid_search = GridSearchCV (
                        estimator = LogisticRegression(),
                        param_grid = param_grid,
                        scoring = 'accuracy',
                        cv = 5,
                        n_jobs = -1
                )
                grid_search.fit (tfidf_train, y_trainL)

                print(grid_search.best_params_)
                print(grid_search.best_score_)

                # Use the winner of the search instead of a hard-coded C=2.
                lr_best = grid_search.best_estimator_
                tf_idf_y_pred = lr_best.predict(tfidf_test)

                print('TF-IDF LR test accuracy %s' % metrics.accuracy_score(y_testL, tf_idf_y_pred))
                print('TF-IDF LR test F1_score %s' % metrics.f1_score(y_testL, tf_idf_y_pred, average="macro"))

                return lr_best

        # TODO: a word2vec_regression variant (KeyedVectors from 'data/sgns.zhihu.word')
        # was sketched here but never completed.

        @staticmethod
        def bert_regression (trainL, testL, y_trainL, y_testL):
                """Experimental BERT probe; trains nothing and returns None.

                Loads the ``bert-mini`` tokenizer and model, encodes one fixed sample
                sentence and prints the model output and the length of one output
                vector. The four arguments are currently unused.
                """
                # Set up basic information
                gpu = 0
                use_cuda = gpu >= 0 and torch.cuda.is_available()
                print(use_cuda)

                if use_cuda:
                        torch.cuda.set_device(gpu)
                        device = torch.device("cuda", gpu)
                else:
                        device = torch.device("cpu")
                logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

                bert_model_dir = 'bert-mini'
                tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
                Bertmodel = BertModel.from_pretrained(bert_model_dir)

                word = ['今天我是一个大笨蛋']
                # NOTE(review): max_length = 0 with truncation looks unintended - confirm.
                input_id = tokenizer(word, padding = True, truncation = True, max_length = 0, return_tensors = 'pt')
                result = Bertmodel(input_id['input_ids'])
                print(result)

                vec_len = len(result[0][0][1])
                print(vec_len)
        
def train_Model (model):
        """Build the training set from ``FILEPATH`` and train the requested model.

        Args:
                model: ``"tfidf"`` trains the TF-IDF logistic regression and pickles it
                        to ``models/tfidf_model1.pkl``; ``"bert"`` runs the experimental
                        BERT probe. Any other value only prepares the data.

        Returns:
                None.
        """
        # Local import so this function brings its own dependency.
        import ast

        # Step 01: load and parse the dataset
        data = inputSource (sourcePath = FILEPATH)
        function.extractDS (targetDS = data)

        # Work on a copy so the assignments below do not write into a slice of ``data``.
        ProcessData = data[['name', 'infotext', 'jobs']].copy()
        print(ProcessData['infotext'])

        # Column-wise apply keeps each value on its own row whatever the index is,
        # unlike the previous positional ``.loc[count, ...]`` loop.
        ProcessData['Type'] = ProcessData['jobs'].apply(function.classifyJobs)
        # extractDS stored the token list as its string form; parse it back without
        # evaluating arbitrary code from the CSV.
        ProcessData['infotext'] = ProcessData['infotext'].apply(ast.literal_eval)

        # Step 02: split into training and test sets (these parameters can be tuned)
        X_Set = ProcessData['infotext'] # X info
        Y_Set = ProcessData['Type']     # Y type
        test_ratio = 0.2

        x_train, x_test, y_train, y_test = train_test_split (X_Set, Y_Set, test_size = test_ratio, random_state = 0)

        if (model == "tfidf"):
                fn = textVary.tf_idf_regression (trainL = x_train, testL = x_test, y_trainL = y_train, y_testL = y_test)
                os.makedirs('models', exist_ok = True)
                with open('models/tfidf_model1.pkl', 'wb') as f:
                        pickle.dump(fn, f)
        elif (model == "bert"):
                # bert_regression returns nothing yet, so there is no model to save.
                textVary.bert_regression (trainL = x_train, testL = x_test, y_trainL = y_train, y_testL = y_test)
        
def recommend_Resume (targetPDF):
        """Predict and print the recommended job type for one PDF resume.

        The TF-IDF model is retrained on every call, because the module-level
        ``counter`` and ``tfidf`` vectorisers are only fitted during training.

        Args:
                targetPDF: Path of the resume PDF.

        Returns:
                None; the prediction is printed.
        """
        targetFile = pdfString.Transformer(file_dir = targetPDF, quitD = 1).info
        targetInfo = preExtractor(sourceText = targetFile["Text"], filename = targetPDF).info
        # infotext is already a token list; no eval round-trip is needed.
        targetText = list(targetInfo["infotext"])

        # Train the model (this also fits the shared vectorisers)
        train_Model(model = "tfidf")

        # Load the trained model
        # NOTE: pickle must only be used on files this program wrote itself.
        with open('models/tfidf_model1.pkl', 'rb') as f:
                fn = pickle.load(f)

        try1 = [' '.join(targetText)]

        # Reuse the fitted IDF weights; re-fitting on a single document would
        # discard what was learned from the training corpus.
        tfidf_try1 = tfidf.transform(counter.transform(try1))
        try1_pred  = fn.predict(tfidf_try1)

        print("PREDICT: " + str(try1_pred) + ": " + Reference.JOB_RECOMMENDATION[try1_pred[0]])
        
# Main Function
if __name__ == "__main__":
        # Manual check: classify one sample resume, then extract its fields with
        # Try02 and display them.
        targetPDF = "Kenny.pdf"
        recommend_Resume(targetPDF = targetPDF)

        extractor = Try02.Extractor(file_dir = targetPDF)
        ansinfo = extractor.search()
        report = Try02.Generator(sourceInfo = ansinfo)
        report.display()

# coding:utf-8
# 目前还缺乏研究 如果有多个专业应该怎么处理
# 多种方式比对

# 信息 先分块 后解析 准确率和效率提升

import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys
# import provinces

################################################################################################
# Power BI / Dataverse endpoints
# NOTE(review): API_TOKEN holds a URL, not a token - confirm the intended name.
BASE_URL  = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"

# Input: folder containing the resume files, used to iterate over all resumes
FolderPath = r"C:\Alan .AIA\Python\CV_Automation\ResumeRespo"  

PdfResumePath = []              # list of pdf resume files that meet the requirements
DocxResumePath = []             # list of docx resume files that meet the requirements
filename = []                   # list of resume file names
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

################################################################################################
# Reference sets and dictionaries

# Hand-picked common surnames (the original note says 200); also used to build Surname_Dict
Surname_List = [
        '赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白'
        ]

Surname_Dict = dict(zip(Surname_List, range(len(Surname_List)))) # maps each surname to its list index, e.g. {'赵': 0, '钱': 1, ...}
# Majors: a line containing one of these is reported as that major
Major_List = [
        '软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术'
        ]

# Skills
Skillset_List = [
        'Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目'
        ]

# Locations
Location_List = [
        '成都', '广州'
        ]

# Resume sources (vendors), matched against the file path and the first lines of text
Vendor_List = [
        '猎聘', '智联', '前程', '领英', '51'
        ]

################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):  
        # 读取文件目录
        def __init__ (self, file_dir):
                
                self.fullWord = []
                self.fullText = ""
                self.file_dir = file_dir 
                
                if os.path.splitext(self.file_dir)[1] == ".pdf":
                        pdf = pb.open(self.file_dir)
                
                for page in pdf.pages:
                        self.fullWord += page.extract_words()
                        self.fullText += page.extract_text() if page.extract_text() else ""
                        
                pdf.close()
        
        # 功能函数 读取一个段落直到某一行的长度只有不到4位中文字符
        def __readUntil (text, length):
                return ""
        
        # 必要部分：姓名 应聘职位 专业 联系电话 附件下载 来源 性别
        # 01 搜索姓名函数  Name
        def __search_Name (self):
                """Guess the candidate's name.

                Tried in order:
                1. a 2-3 character Chinese run in the file name that starts with a
                   known surname;
                2. line by line through the text: the value after a "姓名" label
                   (accepted only if it starts with a known surname; the search stops
                   at that line either way), otherwise the first run on the line that
                   starts with a known surname.

                Returns:
                        The name, or "" when nothing plausible is found.
                """
                # 1) File name; accept both path separator styles.
                base_name = re.split(r"[\\/]", self.file_dir)[-1]
                for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
                        if candidate[0] in Surname_List:
                                return candidate

                # Built once; no "|" inside the class, which would match a literal pipe.
                surname_regex = "[" + "".join(Surname_List) + "]" + r"[\u4e00-\u9fa5]{1,3}"

                # 2) Text, line by line
                for line in self.fullText.split("\n"):
                        if re.search(r"姓\s*名", line):
                                # Capture only the value so the label characters are never
                                # part of the result; ASCII and full-width colons both work.
                                labelled = re.search(r"姓\s*名[:：\s]*([\u4e00-\u9fa5]{2,4})", line)
                                if labelled and labelled.group(1)[0] in Surname_List:
                                        return labelled.group(1)
                                break

                        guessed = re.findall(surname_regex, line)
                        if guessed:
                                return guessed[0]

                return ""
        
        # 02 搜索应聘职位  Jobs
        def __search_Jobs (self):
                
                result = ""
                jobs = []
                full_text = self.fullText
                JobTitle_List = ["期望职位", "应聘职位", "期望从事职位"]
                
                for line in full_text.split("\n"):
                        # 是否在职位字段下
                        if any(title in line for title in JobTitle_List):
                                for title in JobTitle_List:
                                        if re.search(title, line):
                                                job_List = re.findall(r"\s*" + title + "[:：\s]*[a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14}", line)
                                                if (len(job_List) > 0): 
                                                        job = job_List[0]
                                                        job = re.sub(title + r"[:：*\s]", "", job)
                                                        job = re.sub("\s", "", job)
                                                        jobs.append(job)
                                                        return job
                                                        break
                return ""
        
        # 03 搜索专业函数  Major
        def __search_Major (self):
                """Return the candidate's major, or "" if none is found.

                Pass 1 goes through the text line by line: a line containing an entry
                of ``Major_List`` returns that entry at once; values after a "专 业"
                or "行 业" label are collected, and the first one collected is
                returned after the pass.
                Pass 2 goes through the extracted words and returns the first word
                fragment ending in "专业", "系" or "技术" (or, for English text, a
                fragment ending in Bsc / Msc / Major).
                """
                # Labels as laid out by the two resume vendors named in the original
                # comments ("专 业" for 51job, "行 业" for Liepin).
                label_patterns = (
                        r"专[ ]+业[:：\s]*([\u4e00-\u9fa5]{2,10})",
                        r"行[ ]+业[:：\s]*([\u4e00-\u9fa5]{2,10})",
                )
                majors = []

                # Pass 1: labelled fields and known majors
                for line in self.fullText.split("\n"):
                        for pattern in label_patterns:
                                labelled = re.search(pattern, line)
                                # Only record real matches; the capture group keeps label
                                # characters from being stripped out of the value.
                                if labelled:
                                        majors.append(labelled.group(1))

                        for premajor in Major_List:
                                if premajor in line:
                                        return premajor

                if majors:
                        return majors[0]

                # Pass 2: words containing a major / department keyword
                for word in self.fullWord:
                        # PDF words are dict entries with a "text" key; otherwise plain strings.
                        text = word["text"] if isinstance(word, dict) else word

                        # ``"a" or "b" in text`` is always true; test each keyword explicitly.
                        if any(key in text for key in ("专业", "系", "技术")):
                                found = re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text)
                        elif any(key in text for key in ("Bsc", "Major", "Msc")):
                                # NOTE(review): rough heuristic carried over from the original
                                # (with the invalid ``[a-Z]`` range fixed) - confirm the format.
                                found = re.findall(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
                        else:
                                continue

                        if found:
                                return found[0]

                return ""
        
        # 04 搜索电话信息  
                # Area Code and Telephone 暂时没有想到这里该怎么做 带有区号的和不带区号的 还有 Tail 要研究一下
        def __search_Phone (self):
                # 找到含有11位数字的字符串段
                full_text = self.fullText
                phone   = ""
                number  = ""
                number_List = []
                             
                # 通过关键词查找  去除空格和短横线后 前后的小括号 读取 11 13 14 个连续的数字
                for line in full_text.split("\n"):
                        if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                                # 去除标点符号
                                line = re.sub(r"[()（）：:+\-]", "", line)
                                # 选择 11 到 15 位长度的数字
                                number_List = re.findall(r"\d{11,15}", line)
                                
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number                    
                                break
                        
                # 直接通过数字长度查找 返回符合要求的集合
                        if phone == "":
                                text   = re.sub(r"[()（）+\-]", "", full_text)
                                phones = re.findall(r"\d{11,15}", text)
                                phone  = ",".join(set(phones))
                return phone
        
        # 06 确认来源信息  Vendor
        def __search_Vendor (self):
                """Return the resume source from ``Vendor_List``, or "" if unknown.

                The file path is checked first, then the first 21 lines of the text.
                """
                body = self.fullText

                # Look in the file path
                path_hit = next((v for v in Vendor_List if v in self.file_dir), None)
                if path_hit is not None:
                        return path_hit

                # Look in the leading lines of the text
                for line in body.split("\n")[:21]:
                        line_hit = next((v for v in Vendor_List if v in line), None)
                        if line_hit is not None:
                                return line_hit

                return ""
        
        # 07 搜索性别函数  Gender 没写男女就只能通过照片去判断
        def __search_Gender (self):
                gender = "" 
                full_text  = self.fullText
                full_words  = self.fullWord
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 15): break

                        # 性别字段
                        if re.search(r"性[ ]+别*", line):
                                gender = re.findall(r"性[ ]+别[:：\s]*[\u4e00-\u9fa5]{2,10}", line)[0]
                        
                        # 识别到男性字段
                        if re.search(r"男", line) or re.search(r"Male", line): 
                                gender = "男"
                                return gender
                        
                        # 识别到女性字段
                        if re.search(r"女", line) or re.search(r"Female", line): 
                                gender = "女"
                                return gender
                        
                        counter += 1                                
                return gender
        
        # 可选部分: 
        # 08 搜索年龄函数  Age
        def __search_Age (self):
                
                Curr_Year = datetime.datetime.now().year
                number = ""
                full_text  = self.fullText
                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        
                        # 获取出生年月
                        if re.search(r"出生年月", line):
                                number_List = re.findall(r"\d{4,4}", line)
                                if (len(number_List) > 0): number = number_List[0]
                                Age = Curr_Year - int(number)
                                return str(Age) 
                                break
                        
                        # 获取岁
                        if re.search(r"\s*岁", line):
                                number_List = re.findall(r"\d{1,2}", line)
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number  
                                break
                        
                return ""
        
        # 09 判断在职状态  Condition
        def __search_Condition (self):
                full_text  = self.fullText
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 20): break
                        if re.search(r"离职", line): return "离职"
                        if re.search(r"正在找工作", line): return "正在找工作"
                        if re.search(r"在职", line): return "在职"
                        counter += 1                                
                return ""
        
        # 10 搜索城市函数  Cities
        def __search_City (self):
                
                locations = []
                location = ""  
                full_text  = self.fullText
                
                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        
                        if re.search(r"\s*地点", line):
                                
                                loc_List = re.findall(r"\s*地点[:：\s]*[\u4e00-\u9fa5]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                locations.append(re.sub(r"[地点:：\s]", "", location))
                                break
                        
                        if re.search(r"所在地", line) or re.search(r"现居地", line):
                                
                                loc_List = re.findall(r"\s*地[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                locations.append(re.sub(r"[地:：\s]", "", location))
                                break
                        
                        if re.search(r"住\s*址", line) or re.search(r"现居住", line)  or re.search(r"Location", line):
                                # 住址
                                loc_List = re.findall(r"住\s*址[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[住址:：\s]", "", location))
                                        break
                                        
                                # 现居住
                                loc_List = re.findall(r"现居住[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[现居住:：\s]", "", location))
                                        break
                                
                                # Location
                                loc_List = re.findall(r"Location[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[Location:：\s]", "", location))
                                        break
                                break
                        
                if (len(locations) > 0): location = locations[0]
                return location
        
        # 11 搜索学历函数  Stage
        def __search_Stage (self):
                
                stage = ""  
                full_text  = self.fullText
                
                setPhd = ["博士"]
                setMsc = ["硕士", "研究生"]
                setBsc = ["大学", "本科"]
                setByd = ["大专", "专科"]
                setOth = ["学院"]
                setSta = setPhd + setMsc + setBsc + setByd + setOth
              
                # 在学历字段中寻找  
                for line in full_text.split("\n"):
                        
                        if (any (TempStr in line for TempStr in setSta)):
                                
                                if (any (TempStr in line for TempStr in setPhd)): stage =  "博士"
                                if (any (TempStr in line for TempStr in setMsc)): stage =  "硕士"
                                if (any (TempStr in line for TempStr in setBsc)): stage =  "本科"
                                if stage != "": return stage
                        
                if stage == "": return "专科"                    
                return stage
        
        # 12 搜索籍贯函数  Hometown
        def __search_Hometown (self):
                """Extract the candidate's hometown from the "籍贯" field.

                Only the first 15 lines of ``self.fullText`` are inspected. The
                label may be written with or without spaces between the two
                characters ("籍贯" / "籍  贯") and may be followed by a colon or
                whitespace. When several lines match, the last one wins.

                Returns:
                    The 2-10 Chinese characters that follow the label, without
                    the label itself, or "" when no such field is found.
                """
                max_lines = 15
                field = re.compile(r"籍[ ]*贯[:：\s]*([\u4e00-\u9fa5]{2,10})")
                hometown = ""

                for line in self.fullText.split("\n")[:max_lines]:
                        match = field.search(line)
                        # A label without a usable value is skipped instead of raising.
                        if match:
                                hometown = match.group(1)
                return hometown
                
        # 13 搜索自我评价函数  Self-Comment
        def __search_SelfComment (self):
                """Return the candidate's self-evaluation section.

                Not implemented yet: the result is always an empty string.
                """
                # TODO: earlier disabled draft of this method: walk the text line by
                # line, start collecting once a line contains "自我评价", and keep
                # the following lines that are longer than 10 characters.
                return ""
        
        # 14 搜索工作经验函数   Working Experience
        def __search_WorkExperience (self):
                """Placeholder for work-experience extraction; always returns ""."""
                return ""
        
        # 15 搜索教育经历函数   Education Experience
        def __search_EducationExperience (self):
                """Placeholder for education-history extraction; always returns ""."""
                return ""
        
        # 16 搜索学校函数       School
        def __search_School (self):
                """Extract the school name from the resume text.

                ``self.fullText`` is scanned line by line and the first line that
                yields at least one candidate decides the result. A line can
                contribute candidates in two ways:

                * it carries the "毕业院校" label, in which case the 2-10 Chinese
                  characters after the label (and optional colon or whitespace)
                  are taken, never the label itself;
                * it contains "大学", "校区" or "学院", in which case the
                  non-blank run ending in that keyword is taken.

                Returns:
                    The last candidate of the deciding line with whitespace
                    removed, or "" when no line yields a candidate.
                """
                # Order matters: a "学院" match usually extends the "大学" match on
                # the same line, and the last candidate is the one returned.
                college_signs = ["大学", "校区", "学院"]
                note_signs = ["毕业院校"]
                punctuation = ':：|-'
                candidates = []

                for line in self.fullText.split("\n"):
                        # Labelled field: capture only the value after the label.
                        for term in note_signs:
                                candidates += re.findall(term + r"[:：\s]*([\u4e00-\u9fa5]{2,10})", line)

                        # Keyword match: separators become blanks first so that \S
                        # does not run across neighbouring fields.
                        for term in college_signs:
                                if term in line:
                                        line = re.sub('[{}]'.format(punctuation), " ", line)
                                        candidates += re.findall(r"\S{2,10}" + term, line)

                        if candidates:
                                return re.sub(r"\s", "", candidates[-1])

                return ""
        
        # 17 搜索证书函数       Certificate
        def __search_Certificate (self):
                """Placeholder for certificate extraction; always returns ""."""
                return ""

        # 18 搜索专业技能函数   Skill-Set
        def __search_ProfessionalSkills (self):
                """Placeholder for professional-skill extraction; always returns ""."""
                return ""
        
        # 19 搜索期望薪资函数   Expected Salaries
        def __search_Salary (self):
                """Extract the expected salary from the resume text.

                ``self.fullText`` is scanned line by line and the first line that
                yields a candidate decides the result. A line can contribute
                candidates in two ways:

                * it carries the "期望薪资" label, in which case the 2-10
                  non-blank characters after the label are taken;
                * it contains "/月", in which case a "low-high" numeric range is
                  taken. When both apply, the range wins.

                Returns:
                    The chosen candidate with all whitespace removed and amounts
                    written in 万 expanded to plain numbers ("1.5万" becomes
                    "15000"), or "" when nothing is found.
                """
                note_signs = ["期望薪资"]
                salary_list = []

                def expand_wan(match):
                        # 1 万 = 10 000; rounding guards against float noise such as 1.2 * 10000.
                        return str(int(round(float(match.group(1)) * 10000)))

                for line in self.fullText.split("\n"):
                        # Labelled field: capture only the value after the label.
                        for term in note_signs:
                                salary_list += re.findall(term + r"[:：\s]*(\S{2,10})", line)

                        # Monthly figures are written symmetrically around "-": low-high.
                        if "/月" in line:
                                salary_list += re.findall(r"[0-9\.\s万]{1,10}-[0-9\.\s万]{1,10}", line)

                        if salary_list:
                                salary = re.sub(r"\s", "", salary_list[-1])
                                return re.sub(r"([0-9]+(?:\.[0-9]+)?)万", expand_wan, salary)
                return ""
        
        # 20 搜索工作年限函数   Working Stages
        def __search_WorkYears (self):
                """Placeholder for years-of-experience extraction; always returns ""."""
                return ""
        
        # 21 搜索区号函数
        
        # 22 搜索邮箱函数  Email
        def __search_Email (self):
                """Extract the candidate's e-mail address.

                Two passes are made:

                1. Lines of ``self.fullText`` that carry the "邮箱" label (with
                   or without spaces between the two characters) are searched
                   for the first token containing "@".
                2. Otherwise every entry of ``self.fullWord`` is searched. For
                   ".pdf" sources each entry is a mapping whose "text" item holds
                   the string; for other sources the entry is the string itself.

                Returns:
                    The first run of ``[a-zA-Z0-9_\\-.@]`` characters containing
                    "@", or "" when none is found.
                """
                token_pattern = re.compile(r"[a-zA-Z0-9_\-.@]+")

                def first_address(text):
                        # A line may hold other ASCII tokens (phone numbers, ids)
                        # before the address, so every token is checked.
                        for token in token_pattern.findall(text):
                                if "@" in token:
                                        return token
                        return ""

                # Pass 1: the labelled e-mail field.
                for line in self.fullText.split("\n"):
                        if re.search(r"邮[ ]*箱", line):
                                email = first_address(line)
                                if email:
                                        return email

                # Pass 2: any word that looks like an address.
                is_pdf = os.path.splitext(self.file_dir)[1] == ".pdf"
                for word in self.fullWord:
                        text = word["text"] if is_pdf else word
                        if "@" in text and "." in text:
                                email = first_address(text)
                                if email:
                                        return email
                return ""

        # 搜索技能函数  Search Skills
        def __search_Skill (self):
                """Collect the resume lines that mention a known skill keyword.

                Every line of ``self.fullText`` is tested against the entries of
                the module-level ``Skillset_List``; each entry is used as a
                regular expression.

                Returns:
                    The matching lines in document order. A line is listed once
                    even when it matches several keywords.
                """
                matched_lines = []

                for text_line in self.fullText.split("\n"):
                        if any(re.search(keyword, text_line) for keyword in Skillset_List):
                                matched_lines.append(text_line)

                return matched_lines
        
        # 入口函数 返回搜索结果
        def search (self):
                """Run every field extractor and return the collected resume record.

                The path in ``self.file_dir`` is split on "/" or "\\": the last
                component is reported as "file_name" and the one before it as
                "Directory" (the file name itself when the path has a single
                component).

                Each extractor runs inside its own try/except, so a failure in
                one field is printed and leaves that field as "" instead of
                aborting the whole resume.

                Returns:
                    dict with the keys "Directory", "file_name" and one entry per
                    extracted field.
                """
                sep_dir = re.split(r"/+|\\+", self.file_dir)
                file_name = sep_dir[-1]
                directory = sep_dir[-2] if len(sep_dir) > 1 else sep_dir[-1]

                info = {
                        "Directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": "", "hometown": "", "school": "", "salary": "", "selfComment": ""
                        }

                # Extractors are stored uncalled and invoked in this fixed order.
                extractors = (
                        ("user_name",   self.__search_Name),            # name
                        ("jobs",        self.__search_Jobs),            # position applied for
                        ("major",       self.__search_Major),           # major
                        ("phone",       self.__search_Phone),           # phone
                        ("vendor",      self.__search_Vendor),          # source
                        ("gender",      self.__search_Gender),          # gender
                        ("age",         self.__search_Age),             # age
                        ("condition",   self.__search_Condition),       # status
                        ("city",        self.__search_City),            # current city
                        ("stage",       self.__search_Stage),           # education level
                        ("hometown",    self.__search_Hometown),        # hometown
                        ("selfComment", self.__search_SelfComment),     # self-evaluation
                        ("school",      self.__search_School),          # school
                        ("salary",      self.__search_Salary),          # expected salary
                        ("email",       self.__search_Email),           # e-mail
                        ("skill",       self.__search_Skill),           # skills
                )

                for key, extractor in extractors:
                        try:
                                info[key] = extractor()
                        except Exception as e:
                                # Best effort: report the failing field and keep its "" default.
                                print(key, e)
                                continue

                return info

################################################################################################
# 猎聘
# class Lie-Pin (object):

################################################################################################
# 智联
# class Zhi-Lian (object):
        
################################################################################################
# 前程无忧
# class Qian-Cheng (object):

################################################################################################
# 51jobs
# class Jobs (object):

################################################################################################
# 遍历并读取函数
class Reader (object):
        """Lists the resume files of one type that sit directly in a folder."""

        def __init__ (self, folder_Path):
                """Remember the folder that read() will scan.

                Args:
                    folder_Path: path of the folder holding the resume files.
                """
                self.path = folder_Path

        def read (self, type):
                """Return the paths of the files in the folder that match a type.

                Args:
                    type: marker string such as ".pdf"; a file is selected when
                        its name contains it.

                Returns:
                    List of paths (folder joined with file name). Sub-folders and
                    files whose name contains "$" are skipped.

                If the module defines a global list called ``filename``, the bare
                names of the selected files are appended to it as well.
                """
                ResumePath = []
                # Optional module-level registry; absent in some copies of this module.
                registry = globals().get("filename")

                for file in os.listdir(self.path):
                        filepath = os.path.join(self.path, file)
                        if not os.path.isfile(filepath):
                                continue
                        # Test the file name only, so characters in the folder path
                        # cannot select or exclude every file.
                        if (type in file) and ("$" not in file):
                                ResumePath.append(filepath)
                                if isinstance(registry, list):
                                        registry.append(file)
                return ResumePath

################################################################################################
# 输出生成函数
class Generator (object):
        """Presents one extracted resume record on the console or as JSON."""

        def __init__ (self, sourceInfo):
                """Store the record to present.

                Args:
                    sourceInfo: dict of resume fields.
                """
                self.info = sourceInfo

        def display (self):
                """Print the record to stdout, one labelled field per line.

                Raises:
                    KeyError: if the record lacks one of the printed fields.
                """
                result = self.info
                print("################### Candidate ###################")

                # (label, key) pairs in print order: required fields first, then optional ones.
                fields = (
                        ("Name     : ", "user_name"),
                        ("Position ：", "jobs"),
                        ("Major    : ", "major"),
                        ("Phone    : ", "phone"),
                        ("Gender   : ", "gender"),
                        ("Source   : ", "file_name"),
                        ("Vendor   : ", "vendor"),
                        ("Condition: ", "condition"),
                        ("Email    : ", "email"),
                        ("City     : ", "city"),
                        ("Age      : ", "age"),
                        ("Stage    : ", "stage"),
                        ("Hometown : ", "hometown"),
                        ("School   : ", "school"),
                        ("Salary   : ", "salary"),
                )
                for label, key in fields:
                        print(label, result[key])

                print("\n\n\n")

        def generate_Json (self):
                """Serialise the record as pretty-printed JSON.

                Returns:
                    JSON text with sorted keys, 4-space indentation, compact
                    separators and non-ASCII characters kept as-is.

                Raises:
                    TypeError: if the record holds a value json cannot serialise.
                        The error is raised directly rather than printed and
                        replaced by an unrelated UnboundLocalError.
                """
                return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)

################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
        """Publishes resume data to a Dataverse installation through pyDataverse.

        NOTE(review): pyDataverse appears to target the Dataverse research-data
        repository software, not Microsoft Dataverse / Power BI as the surrounding
        comments suggest - confirm which service is intended.
        """
        # Constructor
        def __init__ (self, sourceJson):
                # Stored but not read by process(), which uses a hard-coded file name.
                self.source = sourceJson
        
        # Main routine
        def process (self):
                """Create, publish and fetch a dataverse collection.

                BASE_URL and API_TOKEN are not defined in the visible part of this
                file; they must exist at module level for this method to run.
                """
                
                # Metadata file describing the collection (hard-coded).
                sourceFile = "TestJson.json"
                
                # Connect to the API endpoint
                from pyDataverse.api import NativeApi
                api = NativeApi(BASE_URL, API_TOKEN)
                
                # Create Collection of data
                from pyDataverse.models import Dataverse
                from pyDataverse.utils import read_file
                dv = Dataverse()
                dv.from_json(read_file(sourceFile))
                
                # Each response overwrites the previous one and none is inspected.
                resp = api.create_dataverse (":root", dv.json())
                resp = api.publish_dataverse ("Dataverse_Resumes")
                resp = api.get_dataverse ("Dataverse_Resumes")

################################################################################################
# 杂项函数
class function:
        """Namespace of small helpers for progress output and result files."""

        @staticmethod
        def displayPercent (counter, total, turn):
                """Write a progress indicator to stdout.

                Args:
                    counter: number of items processed so far (int).
                    total: number of items overall (int). With 0 items the
                        progress is shown as 0 instead of dividing by zero.
                    turn: True writes the running percentage followed by a
                        carriage return; False writes the final completion line.

                Returns:
                    Always "".
                """
                assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))

                if turn:
                        percent = float(counter) * 100 / float(total) if total else 0.0
                        sys.stdout.write("%.4f" % percent)
                        # "\r" returns to the line start so the next update overwrites this one.
                        sys.stdout.write("%\r")
                else:
                        sys.stdout.write("100%!finish!\n")
                sys.stdout.flush()
                return ""

        @staticmethod
        def initiateJson (filename):
                """Open an empty JSON result file for writing.

                Args:
                    filename: path containing ".json".

                Returns:
                    The open UTF-8 text file; the caller must close it.
                """
                assert (isinstance(filename, str) and (".json" in filename))
                # Mode "w" already truncates, so no explicit seek/truncate is needed.
                return open(filename, 'w', encoding = 'utf-8')

################################################################################################
# 主函数
if __name__ == "__main__":

        # Step 1: list every resume file in the folder.
        PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
        DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")   # listed, not processed below
        ResumeInfoList = []

        # Step 2: extract the information of every PDF resume.
        total = len(PdfResumePath)
        for counter, file in enumerate(PdfResumePath, start = 1):
                ResumeInfoList.append (Extractor(file_dir = file).search())
                function.displayPercent (counter, total, True)
        function.displayPercent (total, total, False)

        # Step 3: show every record and write all of them as one JSON array.
        # The file is opened only after extraction succeeded, so a failed run
        # does not wipe the previous result, and "with" closes it on errors too.
        length = len(ResumeInfoList)
        with function.initiateJson ("resume_Result.json") as Json_file:
                Json_file.write("[\n")
                for counter, info in enumerate(ResumeInfoList, start = 1):
                        generator = Generator(sourceInfo = info)
                        generator.display()
                        Json_file.write(generator.generate_Json())
                        # Array elements are comma-separated; none after the last.
                        if (counter != length): Json_file.write(",")
                        Json_file.write("\n")
                Json_file.write("]")
        
        # 复制到仓库中
        # Step 3: 导出到 dataverse
        # dataverse_Publish(sourceJson = Json_filename).process()
        

# https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2

################################################################################################
# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
                
################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)

# coding: utf-8

# 结果导出到 result.json文件夹内

import os
from   xml.dom.minidom import parse
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys

PdfResumePath = []              # 符合要求的 pdf 简历文件的列表
DocxResumePath = []             # 符合要求的 docx 简历文件的列表
filename = []                   # 存储简历文件名的目录

# 转换器
class Transformer (object):
    """Reads the text of one PDF resume and packages it for JSON export."""

    def __init__ (self, file_dir, quitD):
        """Extract the text of the PDF at ``file_dir``.

        Args:
            file_dir: path of the resume; its extension must be ".pdf".
            quitD: 1 to collapse doubled Chinese characters in the text,
                any other value to keep the text as extracted.

        Raises:
            ValueError: if ``file_dir`` is not a ".pdf" path.
        """
        self.fulltext = ""
        self.fileDir = file_dir

        if os.path.splitext(self.fileDir)[1] != ".pdf":
            raise ValueError("Unsupported resume file type (only .pdf is handled): " + str(file_dir))

        # The context manager closes the PDF even when a page fails to parse.
        with pb.open(self.fileDir) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text contribute nothing.
        self.fulltext = "".join(text for text in page_texts if text)

        if (quitD == 1):
            self.fulltext = self.quitDuplicate()

        self.info = {"File_name": self.fileDir, "Text": self.fulltext}

    def quitDuplicate (self):
        """Return the text with doubled Chinese characters collapsed per line.

        Every line, including the last one, is terminated with a newline
        in the result.
        """
        return "".join(function.quitDuplicate(line) + '\n' for line in self.fulltext.split("\n"))

    def generate_Json (self):
        """Serialise ``self.info`` as pretty-printed JSON.

        Returns:
            JSON text with sorted keys, 4-space indentation, compact
            separators and non-ASCII characters kept as-is.

        Raises:
            TypeError: if ``self.info`` holds a value json cannot serialise.
        """
        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)

# 功能函数
class function:
    """Namespace of small helpers for progress output, result files and text clean-up."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress indicator to stdout.

        Args:
            counter: number of items processed so far (int).
            total: number of items overall (int). With 0 items the progress
                is shown as 0 instead of dividing by zero.
            turn: True writes the running percentage followed by a carriage
                return; False writes the final completion line.

        Returns:
            Always "".
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))

        if turn:
            percent = float(counter) * 100 / float(total) if total else 0.0
            sys.stdout.write("%.4f" % percent)
            # "\r" returns to the line start so the next update overwrites this one.
            sys.stdout.write("%\r")
        else:
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson (filename):
        """Open an empty JSON result file for writing.

        Args:
            filename: path containing ".json".

        Returns:
            The open UTF-8 text file; the caller must close it.
        """
        assert (isinstance(filename, str) and (".json" in filename))
        # Mode "w" already truncates, so no explicit seek/truncate is needed.
        return open(filename, 'w', encoding = 'utf-8')

    @staticmethod
    def quitDuplicate (source):
        """Collapse doubled Chinese characters in a string.

        A character in the range U+4E00..U+9FFF that equals the character
        kept just before it is dropped. The character that follows a dropped
        one is always kept, so pairs collapse to one character while a
        genuinely repeated character survives: "李李丽丽丽丽" becomes "李丽丽".
        Other characters are never removed.

        Args:
            source: the text to clean.

        Returns:
            The cleaned text.
        """
        kept = []
        just_dropped = False
        for char in source:
            is_repeat = bool(kept) and char == kept[-1] and '\u4e00' <= char <= '\u9fff'
            if is_repeat and not just_dropped:
                just_dropped = True
                continue
            kept.append(char)
            just_dropped = False
        return "".join(kept)
        

# 遍历并读取函数
class Reader (object):
    """Lists the resume files of one type that sit directly in a folder."""

    def __init__ (self, folder_Path):
        """Remember the folder that read() will scan.

        Args:
            folder_Path: path of the folder holding the resume files.
        """
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder that match a type.

        Args:
            type: marker string such as ".pdf"; a file is selected when its
                name contains it.

        Returns:
            List of paths (folder joined with file name). Sub-folders and
            files whose name contains "$" are skipped.

        If the module defines a global list called ``filename``, the bare
        names of the selected files are appended to it as well.
        """
        ResumePath = []
        # Optional module-level registry of selected file names.
        registry = globals().get("filename")

        for file in os.listdir(self.path):
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so characters in the folder path
            # cannot select or exclude every file.
            if (type in file) and ("$" not in file):
                ResumePath.append(filepath)
                if isinstance(registry, list):
                    registry.append(file)

        return ResumePath

# main function
def operater():
    """Convert every PDF resume in the "respo" folder to JSON text.

    Asks on stdin whether doubled Chinese characters should be collapsed
    (answer 1 for yes), then writes one JSON document per resume to
    "Result.json" while showing the progress on stdout.
    """
    FolderPath = r"respo"

    # An empty or non-numeric answer means "no" instead of crashing.
    try:
        quitD = int(input('Quit Duplicate?, if yes input 1\n'))
    except ValueError:
        quitD = 0

    # Step 1: list every resume file in the folder.
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")   # listed, not processed below

    # Step 2: extract each PDF and append its JSON to the result file.
    total = len(PdfResumePath)
    # "with" closes the result file even when a resume fails to convert.
    with function.initiateJson ("Result.json") as Json_file:
        for counter, file in enumerate(PdfResumePath, start = 1):
            Result_Json = Transformer(file_dir = file, quitD = quitD).generate_Json()
            Json_file.write (Result_Json + "\n")
            function.displayPercent (counter, total, True)

    function.displayPercent (total, total, False)

# coding: utf-8

# 结果导出到 result.json文件夹内

import os
from   xml.dom.minidom import parse
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys

PdfResumePath = []              # 符合要求的 pdf 简历文件的列表
DocxResumePath = []             # 符合要求的 docx 简历文件的列表
filename = []                   # 存储简历文件名的目录

# 转换器
class Transformer (object):
    """Reads the text of one PDF resume and packages it for JSON export."""

    def __init__ (self, file_dir, quitD):
        """Extract the text of the PDF at ``file_dir``.

        Args:
            file_dir: path of the resume; its extension must be ".pdf".
            quitD: 1 to collapse doubled Chinese characters in the text,
                any other value to keep the text as extracted.

        Raises:
            ValueError: if ``file_dir`` is not a ".pdf" path.
        """
        self.fulltext = ""
        self.fileDir = file_dir

        if os.path.splitext(self.fileDir)[1] != ".pdf":
            raise ValueError("Unsupported resume file type (only .pdf is handled): " + str(file_dir))

        # The context manager closes the PDF even when a page fails to parse.
        with pb.open(self.fileDir) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text contribute nothing.
        self.fulltext = "".join(text for text in page_texts if text)

        if (quitD == 1):
            self.fulltext = self.quitDuplicate()

        self.info = {"File_name": self.fileDir, "Text": self.fulltext}

    def quitDuplicate (self):
        """Return the text with doubled Chinese characters collapsed per line.

        Every line, including the last one, is terminated with a newline
        in the result.
        """
        return "".join(function.quitDuplicate(line) + '\n' for line in self.fulltext.split("\n"))

    def generate_Json (self):
        """Serialise ``self.info`` as pretty-printed JSON.

        Returns:
            JSON text with sorted keys, 4-space indentation, compact
            separators and non-ASCII characters kept as-is.

        Raises:
            TypeError: if ``self.info`` holds a value json cannot serialise.
        """
        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)

# 功能函数
class function:
    """Namespace of small helpers for progress output, result files and text clean-up."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress indicator to stdout.

        Args:
            counter: number of items processed so far (int).
            total: number of items overall (int). With 0 items the progress
                is shown as 0 instead of dividing by zero.
            turn: True writes the running percentage followed by a carriage
                return; False writes the final completion line.

        Returns:
            Always "".
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))

        if turn:
            percent = float(counter) * 100 / float(total) if total else 0.0
            sys.stdout.write("%.4f" % percent)
            # "\r" returns to the line start so the next update overwrites this one.
            sys.stdout.write("%\r")
        else:
            sys.stdout.write("100%!finish!\n")
        sys.stdout.flush()
        return ""

    @staticmethod
    def initiateJson (filename):
        """Open an empty JSON result file for writing.

        Args:
            filename: path containing ".json".

        Returns:
            The open UTF-8 text file; the caller must close it.
        """
        assert (isinstance(filename, str) and (".json" in filename))
        # Mode "w" already truncates, so no explicit seek/truncate is needed.
        return open(filename, 'w', encoding = 'utf-8')

    @staticmethod
    def quitDuplicate (source):
        """Collapse doubled Chinese characters in a string.

        A character in the range U+4E00..U+9FFF that equals the character
        kept just before it is dropped. The character that follows a dropped
        one is always kept, so pairs collapse to one character while a
        genuinely repeated character survives: "李李丽丽丽丽" becomes "李丽丽".
        Other characters are never removed.

        Args:
            source: the text to clean.

        Returns:
            The cleaned text.
        """
        kept = []
        just_dropped = False
        for char in source:
            is_repeat = bool(kept) and char == kept[-1] and '\u4e00' <= char <= '\u9fff'
            if is_repeat and not just_dropped:
                just_dropped = True
                continue
            kept.append(char)
            just_dropped = False
        return "".join(kept)
        

# 遍历并读取函数
class Reader (object):
    """Lists the resume files of one type that sit directly in a folder."""

    def __init__ (self, folder_Path):
        """Remember the folder that read() will scan.

        Args:
            folder_Path: path of the folder holding the resume files.
        """
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder that match a type.

        Args:
            type: marker string such as ".pdf"; a file is selected when its
                name contains it.

        Returns:
            List of paths (folder joined with file name). Sub-folders and
            files whose name contains "$" are skipped.

        If the module defines a global list called ``filename``, the bare
        names of the selected files are appended to it as well.
        """
        ResumePath = []
        # Optional module-level registry of selected file names.
        registry = globals().get("filename")

        for file in os.listdir(self.path):
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so characters in the folder path
            # cannot select or exclude every file.
            if (type in file) and ("$" not in file):
                ResumePath.append(filepath)
                if isinstance(registry, list):
                    registry.append(file)

        return ResumePath

# main function
def operater():
    """Convert every PDF resume in the "respo" folder to JSON text.

    Asks on stdin whether doubled Chinese characters should be collapsed
    (answer 1 for yes), then writes one JSON document per resume to
    "Result.json" while showing the progress on stdout.
    """
    FolderPath = r"respo"

    # An empty or non-numeric answer means "no" instead of crashing.
    try:
        quitD = int(input('Quit Duplicate?, if yes input 1\n'))
    except ValueError:
        quitD = 0

    # Step 1: list every resume file in the folder.
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")   # listed, not processed below

    # Step 2: extract each PDF and append its JSON to the result file.
    total = len(PdfResumePath)
    # "with" closes the result file even when a resume fails to convert.
    with function.initiateJson ("Result.json") as Json_file:
        for counter, file in enumerate(PdfResumePath, start = 1):
            Result_Json = Transformer(file_dir = file, quitD = quitD).generate_Json()
            Json_file.write (Result_Json + "\n")
            function.displayPercent (counter, total, True)

    function.displayPercent (total, total, False)

#!/usr/bin/env python
# encoding: utf-8

# -*- coding: utf-8 -*-
# @contact: ybsdeyx@foxmail.com
# @software: PyCharm
# @time: 2019/4/25 16:39
# @author: Paulson●Wier
# @file: captcha_qq.py
# @desc:
import numpy as np
import random

import requests
from selenium.webdriver import ActionChains
import time
from selenium import webdriver
from PIL import Image
import os
from selenium.webdriver.support.ui import WebDriverWait
import cv2


class Login(object):
    """
    Drives the Tencent slider-captcha demo page with Selenium and locates
    the slider target with OpenCV template matching.

    Demo page: https://open.captcha.qq.com/online.html
    Stack: python + selenium + cv2.
    The original author reports a success rate of roughly 90% and suggests
    that a real application check for an element that only exists after a
    successful login (such as the user name) and retry in a loop.
    """
    def __init__(self):
        # In a real application the account and password could be set here.
        self.url = "https://open.captcha.qq.com/online.html"
        # NOTE(review): hard-coded local chromedriver path; passing the path
        # positionally is the older Selenium calling style - confirm the
        # Selenium version in use.
        self.driver = webdriver.Chrome(r"C:/Users/E112434/Downloads/chromedriver.exe")

    @staticmethod
    def show(name):
        # Debug helper: show an image in a window until a key is pressed.
        cv2.imshow('Show', name)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    @staticmethod
    def webdriverwait_send_keys(dri, element, value):
        """
        Send keys to an element through an explicit wait.
        :param dri: driver
        :param element: the already located element
        :param value: the text to type
        :return: None

        NOTE(review): the wait condition returns the element that was passed
        in, so it is satisfied at once and does not actually wait.
        """
        WebDriverWait(dri, 10, 5).until(lambda dr: element).send_keys(value)

    @staticmethod
    def webdriverwait_click(dri, element):
        """
        Click an element through an explicit wait.
        :param dri: driver
        :param element: the already located element
        :return: None

        NOTE(review): as in webdriverwait_send_keys, the condition is
        satisfied immediately.
        """
        WebDriverWait(dri, 10, 5).until(lambda dr: element).click()

    @staticmethod
    def get_postion(chunk, canves):
        """
        Locate the gap by matching one image inside the other.
        :param chunk: path of the image that is searched (login_main passes
            the background image)
        :param canves: path of the image used as the template (login_main
            passes the slider piece)
        :return: the index pair of the best match; np.unravel_index on the
            2-D result yields (row, column), so the value named x is the row
            and y is the column - login_main uses element [1] as the
            horizontal offset
        """
        otemp = chunk
        oblk = canves
        # Flag 0 loads both images as greyscale.
        target = cv2.imread(otemp, 0)
        template = cv2.imread(oblk, 0)
        # w, h = target.shape[::-1]
        temp = 'temp.jpg'
        targ = 'targ.jpg'
        # Both images are round-tripped through JPEG files in the working directory.
        cv2.imwrite(temp, template)
        cv2.imwrite(targ, target)
        target = cv2.imread(targ)
        target = cv2.cvtColor(target, cv2.COLOR_BGR2GRAY)
        # Invert the grey levels of the searched image.
        target = abs(255 - target)
        cv2.imwrite(targ, target)
        target = cv2.imread(targ)
        template = cv2.imread(temp)
        result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED)
        x, y = np.unravel_index(result.argmax(), result.shape)
        return x, y
        # # Show the matched region
        # cv2.rectangle(template, (y, x), (y + w, x + h), (7, 249, 151), 2)
        # cv2.imwrite("yuantu.jpg", template)
        # show(template)

    @staticmethod
    def get_track(distance):
        """
        Build a list of per-step horizontal offsets that imitates a human drag.
        :param distance: the distance to cover
        :return: list of integer offsets: an accelerate-then-decelerate run
            that overshoots, followed by eight small negative steps
        """
        # Initial velocity
        v = 0
        # Time step: each track entry is the displacement within 0.2 s
        t = 0.2
        # Displacement list; one element per 0.2 s step
        tracks = []
        # Distance covered so far
        current = 0
        # Start decelerating once this point is reached
        mid = distance * 7 / 8

        distance += 10  # Overshoot a little first, then slide back at the end
        # a = random.randint(1,3)
        while current < distance:
            if current < mid:
                # A smaller acceleration gives smaller steps and a longer, finer track
                a = random.randint(2, 4)  # accelerate
            else:
                a = -random.randint(3, 5)  # decelerate

            # Velocity at the start of this step
            v0 = v
            # Displacement within this 0.2 s step
            s = v0 * t + 0.5 * a * (t ** 2)
            # Update the covered distance
            current += s
            # Record the rounded step
            tracks.append(round(s))

            # The velocity reached becomes the start velocity of the next step
            v = v0 + a * t

        # Slide back towards the approximate target position
        for i in range(4):
            tracks.append(-random.randint(2, 3))
        for i in range(4):
            tracks.append(-random.randint(1, 3))
        return tracks

    @staticmethod
    def urllib_download(imgurl, imgsavepath):
        """
        Download an image.
        :param imgurl: image URL
        :param imgsavepath: path the image is saved to
        :return: None
        """
        from urllib.request import urlretrieve
        urlretrieve(imgurl, imgsavepath)

    def after_quit(self):
        """
        Close the browser.
        :return: None
        """
        self.driver.quit()

    def login_main(self):
        """Open the demo page, locate the gap and drag the slider to it."""
        # ssl._create_default_https_context = ssl._create_unverified_context
        driver = self.driver
        driver.maximize_window()
        driver.get(self.url)

        # NOTE(review): the find_element_by_* helpers used below belong to
        # the older Selenium API - confirm they exist in the installed version.
        click_keyi_username = driver.find_element_by_xpath("//div[@class='wp-onb-tit']/a[text()='可疑用户']")
        self.webdriverwait_click(driver, click_keyi_username)

        login_button = driver.find_element_by_id('code')
        self.webdriverwait_click(driver, login_button)
        time.sleep(1)

        driver.switch_to.frame(driver.find_element_by_id('tcaptcha_iframe'))  # switch into the slider frame
        time.sleep(0.5)
        bk_block = driver.find_element_by_xpath('//img[@id="slideBg"]')  # background image
        web_image_width = bk_block.size
        web_image_width = web_image_width['width']
        bk_block_x = bk_block.location['x']

        slide_block = driver.find_element_by_xpath('//img[@id="slideBlock"]')  # slider piece
        slide_block_x = slide_block.location['x']

        bk_block = driver.find_element_by_xpath('//img[@id="slideBg"]').get_attribute('src')       # background image URL
        slide_block = driver.find_element_by_xpath('//img[@id="slideBlock"]').get_attribute('src')  # slider piece image URL
        slid_ing = driver.find_element_by_xpath('//div[@id="tcaptcha_drag_thumb"]')  # drag handle

        os.makedirs('./image/', exist_ok=True)
        self.urllib_download(bk_block, './image/bkBlock.png')
        self.urllib_download(slide_block, './image/slideBlock.png')
        time.sleep(0.5)
        img_bkblock = Image.open('./image/bkBlock.png')
        real_width = img_bkblock.size[0]
        # Ratio between the downloaded image width and the width shown on the page.
        width_scale = float(real_width) / float(web_image_width)
        position = self.get_postion('./image/bkBlock.png', './image/slideBlock.png')
        # Convert the match column from image pixels to page pixels.
        real_position = position[1] / width_scale
        # Subtract the slider's starting offset relative to the background.
        real_position = real_position - (slide_block_x - bk_block_x)
        track_list = self.get_track(real_position + 4)

        ActionChains(driver).click_and_hold(on_element=slid_ing).perform()  # press and hold the left mouse button
        time.sleep(0.2)
        # print('Step 2: drag the element')
        for track in track_list:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()  # move relative to the current position
            time.sleep(0.002)
        # ActionChains(driver).move_by_offset(xoffset=-random.randint(0, 1), yoffset=0).perform()   # fine adjustment, tune as needed
        time.sleep(1)
        # print('Step 3: release the mouse')
        ActionChains(driver).release(on_element=slid_ing).perform()
        time.sleep(1)

        # NOTE(review): printed unconditionally; the outcome is never checked.
        print('登录成功')
        #self.after_quit()


if __name__ == '__main__':
    # Placeholder value; not used by the code below.
    phone = "****"
    # Creating Login starts a Chrome session; login_main then runs the demo flow.
    login = Login()
    login.login_main()

# coding:utf-8
# 这是jieba的测试文件 jieba 用于中文分词
# docx try

from docx import Document
from docx.shared import Inches  
import os
import jieba
import re
  
document = Document()  # Main entry point of python-docx: creates a new document (the author notes a path such as d:\\2.docx may also be passed)
  
document.add_heading('Document Title', 0)  # Add a heading; per the author's note level 0 uses the "Title" style and other levels use "Heading {level}" (see https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html)
  
p = document.add_paragraph('A plain paragraph having some ') # Add a paragraph
p.add_run('bold').bold = True   # Append the bold run "bold" after "some " in paragraph p
p.add_run(' and some ')  
p.add_run('italic.').italic = True  
  
document.add_heading('Heading, level 1', level=1)   # Add a level-1 heading
document.add_page_break()                           # Add a page break
document.save('demo.docx')                          # Save the document

# NOTE(review): `pattern` is not used anywhere below in this scratch script.
pattern = '（.*）专业'
string  = '崔源的专业是 计算机科学专业。'

# Common Chinese surnames used to recognise names by their first character.
Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','池','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']

# Map each surname to its index in Surname_List.
Surname_Dict = dict(zip(Surname_List, range(len(Surname_List))))
TmpStr = "傻逼秋"
# print (Surname_Dict.get(TmpStr[0]) > 0)

# jieba.lcut splits a string into tokens; the author's sample output was ['liucuiyuan3321', '@', 'outlook', '.', 'com']
print(jieba.lcut(string, cut_all = True))

# NOTE(review): the pattern has no "{}" placeholder, so .format("专业") leaves it unchanged.
result = re.findall(r'[。][^。]*[。]'.format("专业"), string)
print(result)
# Joining a dict iterates its keys. NOTE(review): in the resulting regex text the
# leading '^' applies only to the first alternative, not to the whole alternation.
s = '^'+'|'.join(Surname_Dict)
print(s)
# Check whether the first character of TmpStr is a listed surname.
print((TmpStr[0] in Surname_List))

# coding:utf-8
# 目前还缺乏研究 如果有多个专业应该怎么处理
# 多种方式比对

# 信息 先分块 后解析 准确率和效率提升

import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys
# import provinces

################################################################################################
# PowerBi dataverse
BASE_URL  = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
# NOTE(review): despite its name this holds an endpoint URL, not a token — confirm intended use.
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"

# Input: folder containing the resume files, used to iterate over all resumes
FolderPath = r"C:\Alan .AIA\Python\CV_Automation\ResumeRespo"  

PdfResumePath = []              # list of PDF resume files that meet the requirements
DocxResumePath = []             # list of DOCX resume files that meet the requirements
filename = []                   # names of the resume files
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

################################################################################################
# 参考集 字典

# 个人筛选的 200 个常用姓氏 生成字典
# Common surnames; the first character of a candidate string is checked against this list.
Surname_List = [
        '赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白'
        ]

Surname_Dict = dict(zip(Surname_List, range(len(Surname_List)))) # dict: {'赵':0,'钱':1,'孙':2,'李':3, ...}
# Majors: returned as soon as one of them appears in a text line
Major_List = [
        '软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术'
        ]

# Skills: keywords (used as regex patterns) that mark a line as skill-related
Skillset_List = [
        'Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目'
        ]

# Locations
Location_List = [
        '成都', '广州'
        ]

# Resume sources (vendors), matched as substrings of the path or text
Vendor_List = [
        '猎聘', '智联', '前程', '领英', '51'
        ]

################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):  
        # 读取文件目录
        def __init__ (self, file_dir):
                
                self.fullWord = []
                self.fullText = ""
                self.file_dir = file_dir 
                
                if os.path.splitext(self.file_dir)[1] == ".pdf":
                        pdf = pb.open(self.file_dir)
                
                for page in pdf.pages:
                        self.fullWord += page.extract_words()
                        self.fullText += page.extract_text() if page.extract_text() else ""
                        
                pdf.close()
        
        # 功能函数 读取一个段落知道某一行的长度只有不到4位中文字符
        def __readUntil (text, length):
                return ""
        
        # 必要部分：姓名 应聘职位 专业 联系电话 附件下载 来源 性别
        # 01 搜索姓名函数  Name
        def __search_Name (self):
                """Guess the candidate's name.

                Sources are tried in this order:
                  1. a 2-3 character Chinese run in the file name that starts with
                     a surname from ``Surname_List``;
                  2. per text line, an explicit "姓 名" field, otherwise the first
                     surname-led Chinese run on that line.

                Returns:
                        The name, or "" when no candidate is found (including when a
                        "姓 名" field value does not start with a listed surname).
                """
                # Split on both separators (as search() does) so that a directory
                # name is not mistaken for the candidate on "/"-separated paths.
                base_name = re.split(r"/+|\\+", self.file_dir)[-1]
                for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
                        if candidate[0] in Surname_List:
                                return candidate

                # Alternation instead of a "[赵|钱|...]" class, which also accepted "|".
                surname_run = re.compile(
                        "(?:" + "|".join(re.escape(s) for s in Surname_List) + ")"
                        + r"[\u4e00-\u9fa5]{1,3}"
                )
                # Accept ASCII and full-width colons and capture only the value, so
                # the label characters are never stripped out of the name itself.
                name_field = re.compile(r"姓[ ]+名[ :：]+([\u4e00-\u9fa5]{2,4})")

                for line in self.fullText.split("\n"):
                        if re.search(r"姓[ ]+名", line):
                                field = name_field.search(line)
                                if field:
                                        name = field.group(1)
                                        return name if name[0] in Surname_List else ""
                                # Label without a parsable value: fall through to the
                                # generic scan instead of raising IndexError.
                        found = surname_run.search(line)
                        if found:
                                return found.group(0)
                return ""
        
        # 02 搜索应聘职位  Jobs
        def __search_Jobs (self):
                
                result = ""
                jobs = []
                full_text = self.fullText
                JobTitle_List = ["期望职位", "应聘职位", "期望从事职位"]
                
                for line in full_text.split("\n"):
                        # 是否在职位字段下
                        if any(title in line for title in JobTitle_List):
                                for title in JobTitle_List:
                                        if re.search(title, line):
                                                job_List = re.findall(r"\s*" + title + "[:：\s]*[a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14}", line)
                                                if (len(job_List) > 0): 
                                                        job = job_List[0]
                                                        job = re.sub(title + r"[:：*\s]", "", job)
                                                        job = re.sub("\s", "", job)
                                                        jobs.append(job)
                                                        return job
                                                        break
                return ""
        
        # 03 搜索专业函数  Major
        def __search_Major (self):
                """Extract the candidate's major.

                Order of precedence:
                  1. the first ``Major_List`` entry contained in any text line;
                  2. the value of the first "专 业" or "行 业" field;
                  3. the first extracted word matching a Chinese run ending in
                     专业/系/技术, or 2-5 letters directly followed by
                     Bsc/Msc/Major.

                Returns:
                        The major, or "" if nothing is found.
                """
                # Capture groups return the value only; stripping the label
                # characters afterwards also removed 专/业/行 inside the value.
                field_patterns = (
                        r"专[ ]+业[:：\s]*([\u4e00-\u9fa5]{2,10})",
                        r"行[ ]+业[:：\s]*([\u4e00-\u9fa5]{2,10})",
                )
                field_major = ""

                for line in self.fullText.split("\n"):
                        if not field_major:
                                for pattern in field_patterns:
                                        found = re.search(pattern, line)
                                        if found:
                                                field_major = found.group(1)
                                                break
                        for premajor in Major_List:
                                if premajor in line:
                                        return premajor

                if field_major:
                        return field_major

                # Words are dicts with a "text" key for PDFs, plain strings otherwise.
                is_pdf = os.path.splitext(self.file_dir)[1] == ".pdf"
                for word in self.fullWord:
                        text = word["text"] if is_pdf else word
                        # `"专业" or "系" or ... in text` was always true; test each
                        # keyword explicitly so the English branch is reachable.
                        if any(mark in text for mark in ("专业", "系", "技术")):
                                pattern = r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)"
                        elif any(mark in text for mark in ("Bsc", "Major", "Msc")):
                                # "[a-Z]" is an invalid range; groups are non-capturing
                                # so the whole match is returned.
                                pattern = r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)"
                        else:
                                continue
                        found = re.search(pattern, text)
                        if found:
                                return found.group(0)
                return ""
        
        # 04 搜索电话信息  
                # Area Code and Telephone 暂时没有想到这里该怎么做 带有区号的和不带区号的 还有 Tail 要研究一下
        def __search_Phone (self):
                # 找到含有11位数字的字符串段
                full_text = self.fullText
                phone   = ""
                number  = ""
                number_List = []
                             
                # 通过关键词查找  去除空格和短横线后 前后的小括号 读取 11 13 14 个连续的数字
                for line in full_text.split("\n"):
                        if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                                # 去除标点符号
                                line = re.sub(r"[()（）：:+\-]", "", line)
                                # 选择 11 到 15 位长度的数字
                                number_List = re.findall(r"\d{11,15}", line)
                                
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number                    
                                break
                        
                # 直接通过数字长度查找 返回符合要求的集合
                        if phone == "":
                                text   = re.sub(r"[()（）+\-]", "", full_text)
                                phones = re.findall(r"\d{11,15}", text)
                                phone  = ",".join(set(phones))
                return phone
        
        # 06 确认来源信息  Vendor
        def __search_Vendor (self):
                """Identify the resume source (vendor).

                The file path is checked first; otherwise the first 21 text lines
                are scanned. Entries of ``Vendor_List`` are tried in list order.

                Returns:
                        The matching vendor string, or "" if none matches.
                """
                path_hit = next((v for v in Vendor_List if v in self.file_dir), None)
                if path_hit is not None:
                        return path_hit

                for row in self.fullText.split("\n")[:21]:
                        for vendor in Vendor_List:
                                if vendor in row:
                                        return vendor
                return ""
        
        # 07 搜索性别函数  Gender 没写男女就只能通过照片去判断
        def __search_Gender (self):
                gender = "" 
                full_text  = self.fullText
                full_words  = self.fullWord
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 15): break

                        # 性别字段
                        if re.search(r"性[ ]+别*", line):
                                gender = re.findall(r"性[ ]+别[:：\s]*[\u4e00-\u9fa5]{2,10}", line)[0]
                        
                        # 识别到男性字段
                        if re.search(r"男", line) or re.search(r"Male", line): 
                                gender = "男"
                                return gender
                        
                        # 识别到女性字段
                        if re.search(r"女", line) or re.search(r"Female", line): 
                                gender = "女"
                                return gender
                        
                        counter += 1                                
                return gender
        
        # 可选部分: 
        # 08 搜索年龄函数  Age
        def __search_Age (self):
                
                Curr_Year = datetime.datetime.now().year
                number = ""
                full_text  = self.fullText
                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        
                        # 获取出生年月
                        if re.search(r"出生年月", line):
                                number_List = re.findall(r"\d{4,4}", line)
                                if (len(number_List) > 0): number = number_List[0]
                                Age = Curr_Year - int(number)
                                return str(Age) 
                                break
                        
                        # 获取岁
                        if re.search(r"\s*岁", line):
                                number_List = re.findall(r"\d{1,2}", line)
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number  
                                break
                        
                return ""
        
        # 09 判断在职状态  Condition
        def __search_Condition (self):
                full_text  = self.fullText
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 20): break
                        if re.search(r"离职", line): return "离职"
                        if re.search(r"正在找工作", line): return "正在找工作"
                        if re.search(r"在职", line): return "在职"
                        counter += 1                                
                return ""
        
        # 10 搜索城市函数  Cities
        def __search_City (self):
                
                locations = []
                location = ""  
                full_text  = self.fullText
                
                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        
                        if re.search(r"\s*地点", line):
                                
                                loc_List = re.findall(r"\s*地点[:：\s]*[\u4e00-\u9fa5]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                locations.append(re.sub(r"[地点:：\s]", "", location))
                                break
                        
                        if re.search(r"所在地", line) or re.search(r"现居地", line):
                                
                                loc_List = re.findall(r"\s*地[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                locations.append(re.sub(r"[地:：\s]", "", location))
                                break
                        
                        if re.search(r"住\s*址", line) or re.search(r"现居住", line)  or re.search(r"Location", line):
                                # 住址
                                loc_List = re.findall(r"住\s*址[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[住址:：\s]", "", location))
                                        break
                                        
                                # 现居住
                                loc_List = re.findall(r"现居住[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[现居住:：\s]", "", location))
                                        break
                                
                                # Location
                                loc_List = re.findall(r"Location[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[Location:：\s]", "", location))
                                        break
                                break
                        
                if (len(locations) > 0): location = locations[0]
                return location
        
        # 11 搜索学历函数  Stage
        def __search_Stage (self):
                
                stage = ""  
                full_text  = self.fullText
                
                setPhd = ["博士"]
                setMsc = ["硕士", "研究生"]
                setBsc = ["大学", "本科"]
                setByd = ["大专", "专科"]
                setOth = ["学院"]
                setSta = setPhd + setMsc + setBsc + setByd + setOth
              
                # 在学历字段中寻找  
                for line in full_text.split("\n"):
                        
                        if (any (TempStr in line for TempStr in setSta)):
                                
                                if (any (TempStr in line for TempStr in setPhd)): stage =  "博士"
                                if (any (TempStr in line for TempStr in setMsc)): stage =  "硕士"
                                if (any (TempStr in line for TempStr in setBsc)): stage =  "本科"
                                if stage != "": return stage
                        
                if stage == "": return "专科"                    
                return stage
        
        # 12 搜索籍贯函数  Hometown
        def __search_Hometown (self):
                hometown    = "" 
                full_text   = self.fullText
                full_words  = self.fullWord
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 15): break

                        # 籍贯字段
                        if re.search(r"籍[ ]+贯*", line):
                                hometown = re.findall(r"籍[ ]+贯[:：\s]*[\u4e00-\u9fa5]{2,10}", line)[0] 
                return hometown
                
        # 13 搜索自我评价函数  Self-Comment
        def __search_SelfComment (self):
                selfie    = "" 
                '''
                full_text = self.fullWord
                counter = 0
                
                print(full_text)
                # 在专业字段中寻找  
                turn = False
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        # if (counter < 10): continue

                        # 籍贯字段
                        if re.search  (r"自我评价", line):
                                turn = True
                                print ("Yes" + self.file_dir)
                        
                        if (turn == True) and (len(line) > 10):
                                print (line + "\n")
                '''
                return ""
        
        # 14 搜索工作经验函数   Working Experience
        def __search_WorkExperience (self):
                return ""
        
        # 15 搜索教育经历函数   Education Experience
        def __search_EducationExperience (self):
                return ""
        
        # 16 搜索学校函数       School
        def __search_School (self):
                # 这个顺序有讲究的 一般 大学 校区 学院
                College_signs = ["大学", "校区", "学院"]
                Note_signs = ["毕业院校"]
                school = ""  
                school_list = []
                full_text  = self.fullText
                punctuation = ':：|-'

                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        # 查看是否有相匹配的节点
                        for term in Note_signs:
                                if re.search(term, line):
                                        school_list += re.findall(r"[:：\s]*[\u4e00-\u9fa5]{2,10}", line)
                                        
                        # 看看这行有无关键词 有就加入 用\S避免字符不能识别 先把标点符号替换以区分
                        for term in College_signs:
                                if re.search(r"\s*"+term, line):
                                        line = re.sub('[{}]'.format(punctuation), " ", line)
                                        school_list += re.findall(r"\S{2,10}"+term, line)
                                        
                        # 注意到学院会在大学后面 所以有多个输入最全的一个 ⼤ 大
                        if (len(school_list) > 0):
                                school = re.sub(r"\s", "", school_list[-1])
                                return school
                                break
                
                return ""
        
        # 17 搜索证书函数       Certificate
        def __search_Certificate (self):
                return ""

        # 18 搜索专业技能函数   Skill-Set
        def __search_ProfessionalSkills (self):
                return ""
        
        # 19 搜索期望薪资函数   Expected Salaries
        def __search_Salary (self):

                salary = ""
                Note_signs = ["期望薪资"]
                salary_list = []
                full_text  = self.fullText
                punctuation = ':：|-'

                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        # 查看是否有相匹配的节点
                        for term in Note_signs:
                                if re.search(term, line):
                                        school_list += re.findall(r"[:：\s]*\S{2,10}", line)
                                        
                        # 关键在 - 左右两边对称 多少到多少
                        if re.search("/月", line):
                                print(line)
                                salary_list += re.findall(r"[0-9\.\s 万]{1,10}-[0-9\.\s 万]{1,10}", line)
                                        
                        # 注意到学院会在大学后面 所以有多个输入最全的一个 ⼤ 大
                        if (len(salary_list) > 0):
                                salary = re.sub("万", "0000", salary_list[-1])
                                salary = re.sub(r"[\s ]", "", salary_list[-1])
                                return salary
                                break
                return ""
        
        # 20 搜索工作年限函数   Working Stages
        def __search_WorkYears (self):
                return ""
        
        # 21 搜索区号函数
        
        # 22 搜索邮箱函数  Email
        def __search_Email (self):
                # 找到含有 @ 和 . 的字符串段
                full_words = self.fullWord
                full_text  = self.fullText
                email = ""
                email_List = []
                newEmail = ""
                
                # 先查看邮箱栏下是否有邮箱可以直接选用
                for line in full_text.split("\n"):
                        
                        if re.search(r"邮[ ]+箱", line):
                                newEmail = re.findall(r"[a-zA-Z0-9_\-.@]+", line)[0]
                                email_List.append(re.sub(r"[邮箱:：\s]", "", newEmail))
                
                if (len(email_List) > 0):
                        for TempEmail in email_List:
                                if '@' in TempEmail:
                                        email = email_List[0] 
                                        return email
                
                # 再遍历所有的 word 寻找邮箱特殊的关键词
                for word in full_words:
                        if os.path.splitext(self.file_dir)[1] == ".pdf":
                                text = word["text"]
                        else:
                                text = word
                        if "@" in text and "." in text:
                                for e in re.findall(r"[a-zA-Z0-9_\-.@]+", text):
                                        if "@" in e:
                                                email = e
                                                break
                                if email != "": break
                return email

        # 搜索技能函数  Search Skills
        def __search_Skill (self):
                """Collect the text lines that mention a skill keyword.

                Each entry of ``Skillset_List`` is applied to the line as a regular
                expression; a line is included once, however many keywords match.

                Returns:
                        The matching lines in text order (possibly empty).
                """
                return [
                        row
                        for row in self.fullText.split("\n")
                        if any(re.search(keyword, row) for keyword in Skillset_List)
                ]
        
        # 入口函数 返回搜索结果
        def search (self):
                # 用 \\ 或者 / 区分后 目录名为倒数第二个字符串 文件名为倒数第一个
                sep_dir = re.split(r"/+|\\+", self.file_dir)
                directory = sep_dir[-2]
                file_name = sep_dir[-1]
                
                info = {
                        "Directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": "", "hometown": "", "school": "", "salary": "", "selfComment": ""
                        }
                
                func = {
                        "user_name":    self.__search_Name(),           # 姓名
                        "jobs":         self.__search_Jobs(),           # 应聘职位
                        "major":        self.__search_Major(),          # 专业
                        "phone":        self.__search_Phone(),          # 电话
                        5:              directory,                      # 附件
                        "vendor":       self.__search_Vendor(),         # 来源
                        "gender":       self.__search_Gender(),         # 性别
                        "age":          self.__search_Age(),            # 年龄
                        "condition":    self.__search_Condition(),      # 状态
                        "city":         self.__search_City(),           # 现居地
                        "stage":        self.__search_Stage(),          # 学历
                        "hometown":     self.__search_Hometown(),       # 籍贯
                        "selfComment":  self.__search_SelfComment(),    # 自我评价
                        14:     "",
                        15:     "",
                        "school":       self.__search_School(),         # 学校
                        17:     "",
                        18:     "",
                        "salary":       self.__search_Salary(),
                        20:     "",
                        21:     "",
                        "email":        self.__search_Email(),          # 邮箱
                        23:     "",
                        "skill":        self.__search_Skill(),          # 技能
                }

                for key in info:
                        if (key == "Directory") or (key == "file_name"): continue
                        
                        try:    
                                info[key] = func[key]
                        except Exception as e: 
                                print(e)
                                continue
                
                return info

################################################################################################
# 猎聘
# class Lie-Pin (object):

################################################################################################
# 智联
# class Zhi-Lian (object):
        
################################################################################################
# 前程无忧
# class Qian-Cheng (object):

################################################################################################
# 51jobs
# class Jobs (object):

################################################################################################
# 遍历并读取函数
class Reader (object):
        """Lists the resume files of one type inside a folder."""

        def __init__ (self, folder_Path):
                """Remember the folder to scan.

                Args:
                        folder_Path: Directory that contains the resume files.
                """
                self.path = folder_Path
        
        def read (self, type):
                """Return the paths of all matching files directly inside the folder.

                A file matches when its name contains ``type`` and does not
                contain "$". Each matching file name is also appended to the
                module-level ``filename`` list.

                Args:
                        type: Substring identifying the file type, e.g. ".pdf".

                Returns:
                        List of full paths (folder joined with file name).
                """
                ResumePath = []

                for file in os.listdir(self.path):
                        # Join with the folder that was listed; the global FolderPath
                        # was used before, which broke any Reader built for another
                        # folder.
                        filepath = os.path.join(self.path, file)
                        if not os.path.isfile(filepath):
                                continue
                        # Test the file name rather than the whole path, so a folder
                        # name containing ``type`` or "$" does not affect the result.
                        if (type in file) and ("$" not in file):
                                ResumePath.append(filepath)
                                filename.append(file)
                return ResumePath

################################################################################################
# 输出生成函数
class Generator (object):
        """Present one extracted resume record, on the console or as JSON text."""

        def __init__ (self, sourceInfo):
                """Keep the record (a dict of extracted fields) to be rendered."""
                self.info = sourceInfo

        def display (self):
                """Print the record's fields to stdout.

                The candidate number in the banner is read from the module-level
                ``counter`` variable, which the calling script maintains.
                """
                result = self.info
                print("################### Candidate ", counter, " ###################")

                # Necessary info
                print("Name     : ", result["user_name"])
                print("Position ：", result["jobs"])
                print("Major    : ", result["major"])
                print("Phone    : ", result["phone"])
                print("Gender   : ", result["gender"])
                print("Source   : ", result["file_name"])
                print("Vendor   : ", result["vendor"])
                print("Condition: ", result["condition"])

                # Optional info
                print("Email    : ", result["email"])
                print("City     : ", result["city"])
                print("Age      : ", result["age"])
                print("Stage    : ", result["stage"])
                print("Hometown : ", result["hometown"])
                print("School   : ", result["school"])
                print("Salary   : ", result["salary"])

                print("\n\n\n")

        def generate_Json (self):
                """Return the record serialised as pretty-printed JSON text.

                If the record cannot be serialised the error is printed and "{}" is
                returned, so a caller that concatenates the results into a JSON
                array still produces valid JSON.  (Previously this path failed with
                UnboundLocalError because no value had been assigned.)
                """
                try:
                        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
                except (TypeError, ValueError) as e:
                        print(e)
                        return "{}"

################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
        """Push resume data to a Dataverse instance through pyDataverse."""

        def __init__ (self, sourceJson):
                """Store the JSON source handed in by the caller."""
                self.source = sourceJson

        def process (self):
                """Create, publish and then fetch the "Dataverse_Resumes" collection.

                The collection metadata is read from the fixed file "TestJson.json";
                the connection uses the module-level BASE_URL and API_TOKEN.
                """
                # Connect to the API
                from pyDataverse.api import NativeApi
                client = NativeApi(BASE_URL, API_TOKEN)

                # Build the collection description from the metadata file
                from pyDataverse.models import Dataverse
                from pyDataverse.utils import read_file
                metadata_path = "TestJson.json"
                collection = Dataverse()
                raw_metadata = read_file(metadata_path)
                collection.from_json(raw_metadata)

                # The responses are not inspected
                client.create_dataverse (":root", collection.json())
                client.publish_dataverse ("Dataverse_Resumes")
                client.get_dataverse ("Dataverse_Resumes")

################################################################################################
# 杂项函数
class function:
        """Miscellaneous helpers, used as ``function.displayPercent(...)`` etc."""

        @staticmethod
        def displayPercent (counter, total, turn):
                """Write a progress line to stdout and return "".

                counter -- number of items processed so far (int)
                total   -- number of items overall (int)
                turn    -- True: write the running percentage followed by "\\r";
                           False: write the final "100%!finish!" line.
                """
                assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))

                if (turn):
                        # An empty job has nothing left to do: report it as complete
                        # instead of dividing by zero.
                        percent = float(counter) * 100 / float(total) if total else 100.0
                        sys.stdout.write("%.4f" % percent)
                        sys.stdout.write("%\r")
                        sys.stdout.flush()
                else:
                        sys.stdout.write("100%!finish!\n")
                        sys.stdout.flush()
                return ""

        @staticmethod
        def initiateJson (filename):
                """Open ``filename`` (must contain ".json") for writing and return the file object.

                Mode "w" already empties the file and positions it at offset 0, so
                no further seek or truncate is needed.  The caller closes the file.
                """
                assert (isinstance(filename, str) and (".json" in filename))
                return open(filename, 'w', encoding = 'utf-8')

################################################################################################
# 主函数
if __name__ == "__main__":

        # Step 1: list every resume file in the folder
        PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
        DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
        ResumeInfoList = []

        # Step 2: extract every PDF resume and write the records to a JSON file.
        # The file is held in a ``with`` block so it is closed even when
        # extraction or serialisation raises.
        counter = 0
        total   = len(PdfResumePath)

        with function.initiateJson ("resume_Result.json") as Json_file:

                # Extract the resume information
                for file in PdfResumePath:
                        counter = counter + 1
                        ResumeInfoList.append (Extractor(file_dir = file).search())
                        function.displayPercent (counter, total, True)
                function.displayPercent (counter, total, False)
                counter = 0

                # Display each record and write it as one element of a JSON array.
                # ``counter`` stays a module-level name because Generator.display
                # reads it for the banner.
                Json_file.write("[\n")
                length = len(ResumeInfoList)
                for info in ResumeInfoList:
                        counter = counter + 1
                        report = Generator(sourceInfo = info)
                        report.display()
                        Json_file.write(report.generate_Json())
                        if (counter != length): Json_file.write(",")
                        Json_file.write("\n")
                Json_file.write("]")

        # Step 3 (disabled): export to dataverse
        # dataverse_Publish(sourceJson = Json_filename).process()
        
        # 复制到仓库中
        # Step 3: 导出到 dataverse
        # dataverse_Publish(sourceJson = Json_filename).process()
        

# https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2

################################################################################################
# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
                
################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)

# coding:utf-8
# 这个版本是用于简历分栏
'''
新的思路是 我们遍历每一个 text 的内容 然后看是否读取到这个 text 的长度只有4个字长 
遍历是不是在分隔符的集合内 如果是 就在这里分割

如果用表格抽取 好像只有邓的简历可以用这样的方法
'''

import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys

from collections import OrderedDict

# Input: the folder that holds the resume files; every resume in it is traversed
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"  

PdfResumePath = []              # list of the matching pdf resume files
DocxResumePath = []             # list of the matching docx resume files
filename = []                   # names of the resume files (Reader.read appends to it)
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Read one resume file and split its text into headed paragraphs."""

    def __init__ (self, file_dir):
        """Load the words and text of the resume at ``file_dir``.

        Only ".pdf" files are parsed (with pdfplumber).  For any other
        extension the word list and text stay empty instead of failing on an
        unbound ``pdf`` variable.
        """
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        self.ansList  = []

        if os.path.splitext(self.file_dir)[1] != ".pdf":
            return

        # The context manager closes the PDF even if a page fails to parse.
        with pb.open(self.file_dir) as pdf:
            for page in pdf.pages:
                # words
                self.fullWord += page.extract_words()
                # text; extract once per page and tolerate pages without text
                page_text = page.extract_text()
                self.fullText += page_text if page_text else ""

    def slide (self):
        """Split ``self.fullText`` into paragraphs and return them as a list of lists.

        Every line is first passed through ``function.quitDuplicate``.  Lines
        of five or more characters are accumulated into a text chunk.  A line
        shorter than five characters ends the current chunk (the chunk is
        stored in the current paragraph); if that short line also contains one
        of the heading keywords, the current paragraph is closed and a new one
        is started with the heading as its first element.  Short lines without
        a keyword are not stored.
        """
        pattern_list = ["信息", "评价", "经历", "经验", "技能", "意向"]

        paragraph_list = []
        paragraph = []
        currentText = ""

        for raw_line in self.fullText.split('\n'):
            line = function.quitDuplicate(raw_line)

            if len(line) >= 5:
                currentText += line + "\n"
                continue

            # Short line: flush the text gathered so far
            if currentText:
                paragraph.append(currentText)
                currentText = ""
            # Heading: close the running paragraph and open a new one
            if any(keyword in line for keyword in pattern_list):
                paragraph_list.append(paragraph)
                paragraph = [line]

        # Add the last paragraph
        paragraph.append(currentText)
        paragraph_list.append(paragraph)
        return paragraph_list

    def search (self):
        """Return ``{"Directory", "file_name", "para"}`` for this resume.

        The path is split on "/" or "\\"; the last part is the file name and
        the part before it the directory ("" when the path has no directory).
        "para" holds the paragraphs produced by ``slide`` (previously the
        result of ``slide`` was discarded and "para" was always empty).
        """
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        file_name = sep_dir[-1]
        self.ansList = self.slide()

        info = {"Directory": directory, "file_name": file_name, "para": self.ansList}
        return info
                
################################################################################################
# 遍历并读取函数
class Reader (object):
    """Collect the paths of resume files of one type from a single folder."""

    def __init__ (self, folder_Path):
        """Remember the folder that ``read`` will scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the files in the folder whose path contains ``type``.

        ``type`` is a plain substring such as ".pdf".  Entries that are not
        regular files, and paths containing "$", are skipped.  As a side
        effect every accepted file name is appended to the module-level
        ``filename`` list.
        """
        ResumePath = []

        for file in os.listdir(self.path):
            # Build the path from the folder this reader was created with.
            # (Previously the global FolderPath was used, which silently
            # ignored the constructor argument.)
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            if (type in filepath) and ("$" not in filepath):
                ResumePath.append(filepath)
                filename.append(file)

        return ResumePath

################################################################################################
# 输出生成函数
class Generator (object):
    """Print the paragraphs extracted from one resume."""

    def __init__ (self, sourceInfo):
        """Keep the record (a dict with a "para" entry) to be displayed."""
        self.info = sourceInfo

    def display (self):
        """Print the paragraph count, a banner and the paragraphs to stdout.

        The record shown is the one given to the constructor (previously the
        module-level ``info`` variable was read instead).  The candidate number
        in the banner still comes from the module-level ``counter`` variable
        maintained by the calling script.
        """
        result = self.info
        print("Length: " + str(len(result["para"])))
        print("################### Candidate ", counter, " ###################")

        # print paragraphs
        print(result["para"])

        print("\n\n\n")
                
################################################################################################
# 杂项函数
class function:
    """Miscellaneous helpers, used as ``function.displayPercent(...)`` etc."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Write a progress line to stdout and return "".

        counter -- number of items processed so far (int)
        total   -- number of items overall (int)
        turn    -- True: write the running percentage followed by "\\r";
                   False: write the final "100%!finish!" line.
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))

        if (turn):
            # An empty job has nothing left to do: report it as complete
            # instead of dividing by zero.
            percent = float(counter) * 100 / float(total) if total else 100.0
            sys.stdout.write("%.4f" % percent)
            sys.stdout.write("%\r")
            sys.stdout.flush()
        else:
            sys.stdout.write("100%!finish!\n")
            sys.stdout.flush()
        return ""

    @staticmethod
    def quitDuplicate (source):
        """Return ``source`` with every run of one repeated CJK character collapsed to a single character.

        Only characters in U+4E00..U+9FFF are affected; repeated letters,
        digits and punctuation are kept.  Runs of any length are collapsed
        (previously a position was skipped after each removal, so a tripled
        character was only reduced to a doubled one).
        """
        kept = []
        for char in source:
            if kept and char == kept[-1] and '\u4e00' <= char <= '\u9fff':
                continue
            kept.append(char)
        return "".join(kept)
        
################################################################################################
# 主函数
if __name__ == "__main__":

    # Step 1: list every resume file in the folder
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")

    # Step 2: extract every PDF resume, reporting progress as we go
    ResumeInfoList = []
    total   = len(PdfResumePath)
    counter = 0

    for counter, file in enumerate(PdfResumePath, start = 1):
        ResumeInfoList.append (Extractor(file_dir = file).search())
        function.displayPercent (counter, total, True)

    function.displayPercent (counter, total, False)

    # Show each extracted record; ``counter`` and ``info`` stay module-level
    # names because Generator.display reads them.
    counter = 0
    for counter, info in enumerate(ResumeInfoList, start = 1):
        Generator(sourceInfo = info).display()

# coding:utf-8
# 版本04 试验对简历信息的分栏处理

import os
import re
from xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
# import provinces

################################################################################################
# PowerBi dataverse connection settings
# NOTE(review): API_TOKEN holds a URL rather than a token-looking value — confirm.
BASE_URL  = "https://globaldisco.crm5.dynamics.com/api/discovery/v2.0/Instances"
API_TOKEN = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"

# Input: the folder that holds the resume files; every resume in it is traversed
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"  

PdfResumePath = []              # list of the matching pdf resume files
DocxResumePath = []             # list of the matching docx resume files
filename = []                   # names of the resume files
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

################################################################################################
# Reference sets and dictionaries.
# These are module-level names (the extractor methods refer to them directly),
# so they must start at column 0; the former indentation was a syntax error.

# Hand-picked list of 200 common surnames
Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']

# Surname -> position in Surname_List, e.g. {'赵':0,'钱':1,'孙':2,'李':3, ...}
Surname_Dict = dict(zip(Surname_List, range(len(Surname_List))))

# Majors
Major_List = ['软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术']

# Skills
Skillset_List = ['Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目']

# Locations
Location_List = ['成都', '广州']

# Resume sources (vendors)
Vendor_List = ['猎聘', '智联', '前程', '领英', '51']

################################################################################################
# 子函数
################################################################################################
# 抽取器 抽取单个文件的信息
class Extractor (object):  
        # 读取文件目录
        def __init__ (self, file_dir):
                
                self.fullWord = []
                self.fullText = ""
                self.file_dir = file_dir 
                
                if os.path.splitext(self.file_dir)[1] == ".pdf":
                        pdf = pb.open(self.file_dir)
                
                for page in pdf.pages:
                        self.fullWord += page.extract_words()
                        self.fullText += page.extract_text() if page.extract_text() else ""
                        
                pdf.close()
        
        # 必要部分：姓名 应聘职位 专业 联系电话 附件下载 来源 性别
        # 01 搜索姓名函数  Name
        def __search_Name (self):
                
                result = ""
                names = []
                full_text = self.fullText
                
                # 查看是否在文件名下 但是3位容易出现 4位名字扫不到 反之 出现李强简历之类的 
                dir_Set = re.findall(r"[\u4e00-\u9fa5]{2,3}", ((self.file_dir).split("\\"))[-1] )
                if (len(dir_Set) > 0):
                        for TempDir in dir_Set: 
                                if (TempDir[0] in Surname_List):
                                        return TempDir

                # 查看是否在姓名字段下 一般认为出现在前十五行 所以设置count遍历
                for line in full_text.split("\n"):

                        # 是否在姓名字段下
                        if re.search(r"姓[ ]+名", line):
                                name = re.findall(r"姓[ ]+名[ :\\n]+[\u4e00-\u9fa5]{2,4}", line)[0]
                                names.append(re.sub(r"[姓名:：\s]", "", name))
                                break
                        
                        # 没有姓名字段 则分解该行 看看是不是有带有合适的姓氏的中文词汇
                        else:   
                                regex_str = "[" + "|".join(Surname_List) +"]" +r'[\u4e00-\u9fa5]{1,3}'
                                nameset = re.findall (regex_str, line)
                                if len(nameset): return nameset[0]
                                names += nameset                     

                # 筛选好 names 嫌疑集合 对 names 集合内的元素鉴定是否有姓氏 返回有姓氏的那个
                for TmpName in names: 
                        if (TmpName[0] in Surname_List):
                                result = TmpName
                                return result   
                return result
        
        # 02 搜索应聘职位  Jobs
        def __search_Jobs (self):
                
                result = ""
                jobs = []
                full_text = self.fullText
                JobTitle_List = ["期望职位", "应聘职位", "期望从事职位"]
                
                for line in full_text.split("\n"):
                        # 是否在职位字段下
                        if any(title in line for title in JobTitle_List):
                                for title in JobTitle_List:
                                        if re.search(title, line):
                                                job_List = re.findall(r"\s*" + title + "[:：\s]*[a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14}", line)
                                                if (len(job_List) > 0): 
                                                        job = job_List[0]
                                                        job = re.sub(title + r"[:：*\s]", "", job)
                                                        job = re.sub("\s", "", job)
                                                        jobs.append(job)
                                                        return job
                                                        break
                return ""
        
        # 03 搜索专业函数  Major
        def __search_Major (self):
                majors = []
                major = ""  
                result = ""
                full_text   = self.fullText
                full_words  = self.fullWord
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):
                        
                        # 51 job        
                        if re.search(r"专[ ]+业*", line):
                                majorList = re.findall(r"专[ ]+业[:：\s]*[\u4e00-\u9fa5]{2,10}", line)
                                if (len(majorList) > 0): major = majorList[0]
                                majors.append(re.sub(r"[专业:：\s]", "", major))
                        
                        # 猎聘通
                        if re.search(r"\s*行[ ]+业*", line):
                                majorList = re.findall(r"\s*行[ ]+业[:：\s]*[\u4e00-\u9fa5]{2,10}", line)
                                if (len(majorList) > 0): major = majorList[0]
                                majors.append(re.sub(r"[行业:：\s]", "", major))
                        
                        for premajor in Major_List:
                                if premajor in line:
                                        return premajor
                
                if (len(majors) > 0): 
                        if (len(majors[0]) > 0):
                                return majors[0]
                
                # 在正文部分中寻找 带有专业或者系的字段
                for word in full_words:
                        
                        text = ""
                        textMajor = ""
                        if os.path.splitext(self.file_dir)[1] == ".pdf": text = word["text"]
                        else: text = word
                        
                        # 中文专业 尴尬的事情是扫码联系
                        if "专业" or "系" or "技术" in text:
                                for m in re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text):
                                        if "专业" or "系" or "技术" in m:
                                                majors.append(m)
                                                textMajor = m;
                                                break
                                if textMajor != "": break
                        
                        # 英文专业 这一部分还需要修改
                        elif "Bsc" or "Major" or "Msc" in text:
                                for m in re.findall(r"[a-Z]{2,5}?(?:(Bsc)|(Msc)|Major)", text):
                                        if "Bsc" or "Major" or "Msc" in m:
                                                majors.append(m)
                                                textMajor = m;
                                                break
                                if textMajor != "": break
                                
                if len(majors) > 0: 
                        for m in majors:
                                if (len(m) == 0): continue
                                result = m
                return result  
        
        # 04 搜索电话信息  
                # Area Code and Telephone 暂时没有想到这里该怎么做 带有区号的和不带区号的 还有 Tail 要研究一下
        def __search_Phone (self):
                # 找到含有11位数字的字符串段
                full_text = self.fullText
                phone   = ""
                number  = ""
                number_List = []
                             
                # 通过关键词查找
                for line in full_text.split("\n"):
                        if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                                # 去除标点符号
                                line = re.sub(r"[()（）：:+\-]", "", line)
                                # 选择 11 到 15 位长度的数字
                                number_List = re.findall(r"\d{11,15}", line)
                                
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number                    
                                break
                        
                # 直接通过数字长度查找 返回符合要求的集合
                        if phone == "":
                                text   = re.sub(r"[()（）+\-]", "", full_text)
                                phones = re.findall(r"\d{11,15}", text)
                                phone  = ",".join(set(phones))
                return phone
        
        # 06 确认来源信息
        def __search_Vendor (self):
                
                directory = self.file_dir
                full_text = self.fullText
                
                # 在目录中寻找
                for vendor in Vendor_List:
                        if vendor in self.file_dir:
                                return vendor

                # 在字段中寻找
                count = 0
                for line in full_text.split("\n"):
                        if (count > 20): break
                        for vendor in Vendor_List:
                                if vendor in line: return vendor
                        count += 1

                return ""
        
        # 07 搜索性别函数  Gender 没写男女就只能通过照片去判断
        def __search_Gender (self):
                gender = "" 
                full_text  = self.fullText
                full_words  = self.fullWord
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 15): break

                        # 性别字段
                        if re.search(r"性[ ]+别*", line):
                                gender = re.findall(r"性[ ]+别[:：\s]*[\u4e00-\u9fa5]{2,10}", line)[0]
                        
                        # 识别到男性字段
                        if re.search(r"男", line) or re.search(r"Male", line): 
                                gender = "男"
                                return gender
                        
                        # 识别到女性字段
                        if re.search(r"女", line) or re.search(r"Female", line): 
                                gender = "女"
                                return gender
                        
                        counter += 1                                
                return gender
        
        # 可选部分: 
        # 08 搜索年龄函数
        def __search_Age (self):
                
                Curr_Year = datetime.datetime.now().year
                number = ""
                full_text  = self.fullText
                # 在地点字段中寻找  
                for line in full_text.split("\n"):
                        
                        # 获取出生年月
                        if re.search(r"出生年月", line):
                                number_List = re.findall(r"\d{4,4}", line)
                                if (len(number_List) > 0): number = number_List[0]
                                Age = Curr_Year - int(number)
                                return str(Age) 
                                break
                        
                        # 获取岁
                        if re.search(r"\s*岁", line):
                                number_List = re.findall(r"\d{1,2}", line)
                                if (len(number_List) > 0): 
                                        number = number_List[0]
                                        return number  
                                break
                        
                return ""
        
        # 09 判断在职状态
        def __search_Cond (self):
                full_text  = self.fullText
                counter = 0
              
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 20): break
                        if re.search(r"离职", line): return "离职"
                        if re.search(r"正在找工作", line): return "正在找工作"
                        if re.search(r"在职", line): return "在职"
                        counter += 1                                
                return ""
        
        # 10 搜索城市函数
        def __search_City (self):
                """Return the candidate's city as read from a location label, or "".

                The text is scanned line by line.  Only the first line that
                contains one of the labels is examined, because every label
                branch ends in ``break``.  The label characters, colons and
                whitespace are removed from the match with a character-class
                substitution, which also removes those characters if they occur
                inside the value.
                """
                
                locations = []
                location = ""  
                full_text  = self.fullText
                
                # Look for a location label
                for line in full_text.split("\n"):
                        
                        # Label "地点": value is the 2-8 Chinese characters after it
                        if re.search(r"\s*地点", line):
                                
                                loc_List = re.findall(r"\s*地点[:：\s]*[\u4e00-\u9fa5]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                # appended even when nothing matched (location is then still "")
                                locations.append(re.sub(r"[地点:：\s]", "", location))
                                break
                        
                        # Labels "所在地" / "现居地": the value is taken after the first "地" that is followed by a value
                        if re.search(r"所在地", line) or re.search(r"现居地", line):
                                
                                loc_List = re.findall(r"\s*地[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): location = loc_List[0]
                                locations.append(re.sub(r"[地:：\s]", "", location))
                                break
                        
                        # Labels "住址" / "现居住" / "Location": tried in this order, first match wins
                        if re.search(r"住\s*址", line) or re.search(r"现居住", line)  or re.search(r"Location", line):
                                # "住址"
                                loc_List = re.findall(r"住\s*址[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[住址:：\s]", "", location))
                                        break
                                        
                                # "现居住"
                                loc_List = re.findall(r"现居住[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[现居住:：\s]", "", location))
                                        break
                                
                                # "Location" (the value pattern accepts Chinese characters and whitespace only)
                                loc_List = re.findall(r"Location[:：\s]*[\u4e00-\u9fa5\s]{2,8}", line)
                                if (len(loc_List) > 0): 
                                        location = loc_List[0]
                                        locations.append(re.sub(r"[Location:：\s]", "", location))
                                        break
                                # label present but no value could be read: stop scanning
                                break
                        
                # The first cleaned value wins; otherwise location is still ""
                if (len(locations) > 0): location = locations[0]
                return location
        
        # 11 搜索学历函数
        def __search_Stage (self):
                
                stage = ""  
                full_text  = self.fullText
                
                setPhd = ["博士"]
                setMsc = ["硕士", "研究生"]
                setBsc = ["大学", "本科"]
                setByd = ["大专", "专科"]
                setOth = ["学院"]
                setSta = setPhd + setMsc + setBsc + setByd + setOth
              
                # 在学历字段中寻找  
                for line in full_text.split("\n"):
                        
                        if (any (TempStr in line for TempStr in setSta)):
                                
                                if (any (TempStr in line for TempStr in setPhd)): stage =  "博士"
                                if (any (TempStr in line for TempStr in setMsc)): stage =  "硕士"
                                if (any (TempStr in line for TempStr in setBsc)): stage =  "本科"
                                if stage != "": return stage
                        
                if stage == "": return "专科"                    
                return stage
        
        # 12 搜索籍贯函数
        def __search_Hometown (self):
                """Placeholder for the hometown lookup: always returns ""."""
                return ""
        
        # 13 搜索
        
        
        # 14 搜索
        
        # 15 搜索
        
        # 16 搜索
        
        # 17 搜索
        
        # 18 搜索
        
        # 19 搜索
        
        # 20 搜索
        
        # 21 搜索
        
        # 22 搜索邮箱函数  Email
        def __search_Email (self):
                # 找到含有 @ 和 . 的字符串段
                full_words  = self.fullWord
                full_text  = self.fullText
                email = ""
                email_List = []
                newEmail = ""
                
                # 先查看邮箱栏下是否有邮箱可以直接选用
                for line in full_text.split("\n"):
                        
                        if re.search(r"邮[ ]+箱", line):
                                newEmail = re.findall(r"[a-zA-Z0-9_\-.@]+", line)[0]
                                email_List.append(re.sub(r"[邮箱:：\s]", "", newEmail))
                
                if (len(email_List) > 0):
                        for TempEmail in email_List:
                                if '@' in TempEmail:
                                        email = email_List[0] 
                                        return email
                
                # 再遍历所有的 word 寻找邮箱特殊的关键词
                for word in full_words:
                        if os.path.splitext(self.file_dir)[1] == ".pdf":
                                text = word["text"]
                        else:
                                text = word
                        if "@" in text and "." in text:
                                for e in re.findall(r"[a-zA-Z0-9_\-.@]+", text):
                                        if "@" in e:
                                                email = e
                                                break
                                if email != "": break
                return email

        # 搜索技能函数
        def __search_Skill (self):
                """Return every line of the resume text that mentions a skill keyword.

                A line of ``self.fullText`` is kept, once, as soon as any entry of
                ``Skillset_List`` matches it via ``re.search``; lines are returned
                in document order.

                NOTE(review): ``Skillset_List`` is not defined in this block; it is
                presumably a module-level list of regex patterns - confirm.
                """
                return [
                        line
                        for line in self.fullText.split("\n")
                        # any() stops at the first matching keyword for the line.
                        if any(re.search(keyword, line) for keyword in Skillset_List)
                ]
        
        # 入口函数 返回搜索结果
        def search (self):
                # 用 \\ 或者 / 区分后 目录名为倒数第二个字符串 文件名为倒数第一个
                sep_dir = re.split(r"/+|\\+", self.file_dir)
                directory = sep_dir[-2]
                file_name = sep_dir[-1]
                
                info = {"directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": ""}
                
                

              # 下面的一大段之后想写一个函数替代
                
                # 查找姓名
                try:
                        info["user_name"] = self.__search_Name()
                except Exception as e: print("User_Name: " + e)
                
                # 查找 Email
                try:
                        info["email"] = self.__search_Email()
                except Exception as e: print("Email: " + e)
                
                # 查找 Phone
                try:
                        info["phone"] = self.__search_Phone()             # 无奈之举选择前 11 位 之后需要做实验和讨论
                except Exception as e: print("Phone: " + e)
                
                # 查找 Major
                try:
                        info["major"] = self.__search_Major()
                except Exception as e: print("Major: " + e)
                
                # 查找 Gender
                try:
                        info["gender"] = self.__search_Gender()
                except Exception as e: print("Gender :" + e)
                
                # 查找 Stage
                try:
                        info["stage"] = self.__search_Stage()
                except Exception as e: print("Stage: " + e)
                
                # 查找 City
                try:
                        info["city"] = self.__search_City()
                except Exception as e: print("City: " + e)

                # 查找 Age
                try:
                        info["age"] = self.__search_Age()
                except Exception as e: print("Age: " + e)
                
                # 查找 Skills
                try:
                        info["skill"] = self.__search_Skill()
                except Exception as e: print("Skill: " + e)
                
                # 查找 Jobs
                try:
                        info["jobs"] = self.__search_Jobs()
                except Exception as e: print("Jobs: " + e)
                
                # 查找 Vendor
                try:
                        info["vendor"] = self.__search_Vendor()
                except Exception as e: print("Vendor: " + e)
                
                # 
                
                return info

################################################################################################
# 猎聘
# class Liepin (object):

################################################################################################
# 智联
# class Zhilian (object):
        
################################################################################################
# 前程无忧
# class Qiancheng (object):

################################################################################################
# 51jobs
# class Jobs (object):

################################################################################################
# 遍历并读取函数
class Reader (object):
        """List the resume files of one type inside a folder."""

        def __init__ (self, folder_Path):
                """Remember the folder to scan.

                Args:
                        folder_Path: directory that contains the resume files.
                """
                self.path = folder_Path

        def read (self, type):
                """Return the paths of the regular files whose name contains ``type``.

                Args:
                        type: substring identifying the file type, e.g. ".pdf".

                Returns:
                        list of ``os.path.join(self.path, name)`` strings, sorted by
                        file name.  Names containing "$" are skipped.
                """
                ResumePath = []

                # Sorted so the result does not depend on the OS listing order.
                for file in sorted(os.listdir(self.path)):
                        # Build the path from the folder given to the constructor
                        # (the previous code used an unrelated global instead).
                        filepath = os.path.join(self.path, file)
                        if not os.path.isfile(filepath):
                                continue
                        # Test the file name only, so characters in the folder
                        # path cannot influence the filter.
                        if type in file and "$" not in file:
                                ResumePath.append(filepath)

                return ResumePath

################################################################################################
# 输出生成函数
class Generator (object):
        """Present one extracted resume record on stdout or as JSON text."""

        def __init__ (self, sourceInfo):
                """Store the record to present.

                Args:
                        sourceInfo: dict holding the extracted resume fields.
                """
                self.info = sourceInfo

        def display (self):
                """Print the record to stdout, one labelled field per line.

                The banner shows the module-level ``counter`` when the driver
                script has defined one; otherwise it is left blank.

                Raises:
                        KeyError: if the record lacks one of the printed fields.
                """
                result = self.info

                # `counter` is maintained by the calling script, not by this class;
                # look it up defensively so display() also works on its own.
                index = globals().get("counter", "")
                print("################### Candidate ", index, " ###################")

                fields = (
                        # Necessary info
                        ("Name     : ", "user_name"),
                        ("Position ：", "jobs"),
                        ("Major    : ", "major"),
                        ("Phone    : ", "phone"),
                        ("Gender   : ", "gender"),
                        ("Source   : ", "file_name"),
                        ("Vendor   : ", "vendor"),
                        ("Condition: ", "condition"),
                        # Optional info
                        ("Email    : ", "email"),
                        ("City     : ", "city"),
                        ("Age      : ", "age"),
                        ("Stage    : ", "stage"),
                )
                for label, key in fields:
                        print(label, result[key])
                print("\n\n\n")

        def generate_Json (self):
                """Serialise the record as pretty-printed JSON text.

                Keys are sorted, indentation is 4 spaces and non-ASCII characters
                are written as-is.

                Returns:
                        The JSON string, or "{}" when the record cannot be
                        serialised (the error is printed).
                """
                try:
                        return json.dumps (self.info, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
                except (TypeError, ValueError) as e:
                        # Keep the pipeline going, but hand back valid JSON instead
                        # of failing on an unassigned local.
                        print(e)
                        return "{}"

################################################################################################
# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
        """Create and publish a Dataverse collection through pyDataverse.

        NOTE(review): pyDataverse is, as far as I know, a client for the
        Dataverse Project repository software; confirm that the endpoint in
        BASE_URL is compatible with it.
        """

        def __init__ (self, sourceJson):
                """Keep the JSON source handed in by the caller.

                Args:
                        sourceJson: stored as ``self.source``; ``process`` does not
                                read it and uses a fixed metadata file instead.
                """
                self.source = sourceJson

        def process (self):
                """Create the collection under ":root", publish it and fetch it back.

                The collection metadata is read from "TestJson.json".  BASE_URL and
                API_TOKEN are taken from module scope.  The API responses are not
                inspected or returned.
                """
                metadata_file = "TestJson.json"

                # Connect to the API.
                from pyDataverse.api import NativeApi
                client = NativeApi(BASE_URL, API_TOKEN)

                # Build the collection description from the metadata file.
                from pyDataverse.models import Dataverse
                from pyDataverse.utils import read_file
                collection = Dataverse()
                collection.from_json(read_file(metadata_file))

                client.create_dataverse (":root", collection.json())
                client.publish_dataverse ("Dataverse_Resumes")
                client.get_dataverse ("Dataverse_Resumes")

################################################################################################
# 主函数
if __name__ == "__main__":

        # Step 1: collect the resume files found in FolderPath.
        # Only the PDF list is processed below; the DOCX list is collected but
        # not used yet.
        PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
        DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")

        # Step 2: extract every resume and write one JSON document per resume.
        # `counter` stays a module-level name because Generator.display reads it.
        counter = 0
        Json_filename = "resume_Result.json"

        # Mode 'w' already truncates the file, and the context manager closes
        # it even when an extraction raises.
        with open(Json_filename, 'w', encoding = 'utf-8') as Json_file:
                for file in PdfResumePath:
                        counter = counter + 1
                        info = Extractor(file_dir = file).search()

                        # Show the record, then append it to the JSON file.
                        writer = Generator(sourceInfo = info)
                        writer.display()
                        Json_file.write(writer.generate_Json() + "\n")

        # Step 3: export to dataverse (disabled).
        # dataverse_Publish(sourceJson = Json_filename).process()
        

# https://orgd9c1d674.api.crm5.dynamics.com/api/data/v9.2
# https://org61624faf.api.crm5.dynamics.com/api/data/v9.2


################################################################################################
# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
                
################################################################################################
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)

# Test for pyDataverse

from pyDataverse.api    import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse
from pyDataverse.models import Datafile
from pyDataverse.models import Dataset
from pyDataverse.utils  import read_file
from pyDataverse.api    import NativeApi

# Endpoint URL and API token handed to pyDataverse's NativeApi by the helpers below.
# API_TOKEN is an empty placeholder here; presumably a real token must be supplied
# before any request can authenticate - confirm.
BASE_URL  = "https://org61624faf.api.crm5.dynamics.com/api/data/v9.2"
API_TOKEN = ""

class pyDataFunc (object):
    """Thin convenience wrapper around pyDataverse's NativeApi.

    The API client is created on first use from ``baseURL`` / ``apiTOKEN`` and
    kept on ``self.api``, so every method talks to the same connection.
    """

    def __init__ (self):
        """Take the endpoint and token from the module-level constants."""
        self.baseURL  = BASE_URL
        self.apiTOKEN = API_TOKEN
        self.api      = None

    def _client (self):
        """Return the shared NativeApi instance, creating it on first use."""
        api = getattr(self, "api", None)
        if api is None:
            api = self.api = NativeApi(self.baseURL, self.apiTOKEN)
        return api

    def getAPI (self):
        """Connect to the API, print and return the server's version response."""
        # Both settings must be strings.
        assert (isinstance(self.baseURL, str) and isinstance(self.apiTOKEN, str))

        response = self._client().get_info_version()

        print (response)
        return response

    def setDataverse (self, filename):
        """Load a Dataverse collection description from a JSON file.

        Args:
            filename: path of the JSON file; must contain ".json".

        Returns:
            The populated ``Dataverse`` model.
        """
        assert(isinstance(filename, str) and (".json" in filename))

        dv = Dataverse()
        dv.from_json(read_file(filename))
        return dv

    def setDataset (self, filename):
        """Load a dataset description from a JSON file.

        Args:
            filename: path of the JSON file; must contain ".json".

        Returns:
            The populated ``Dataset`` model.
        """
        assert(isinstance(filename, str) and (".json" in filename))

        ds = Dataset()
        ds.from_json(read_file(filename))
        return ds

    def createDataverse (self, dv, verseLoc, verseName):
        """Create the collection ``dv`` under ``verseLoc`` and return the response.

        Args:
            dv: model exposing ``json()``.
            verseLoc: identifier of the parent collection.
            verseName: must be a string; it is validated but not sent to the API.
        """
        assert(isinstance(verseName, str))

        response = self._client().create_dataverse (verseLoc, dv.json())
        print(response)
        return response

    def createDataset (self, ds, setName):
        """Create the dataset ``ds`` in the collection ``setName``; return the response.

        Args:
            ds: model exposing ``validate_json()`` and ``json()``.
            setName: identifier of the target collection.
        """
        assert(ds.validate_json() and isinstance(setName, str))

        response = self._client().create_dataset(setName, ds.json())
        return response

    def getPID (self, resp):
        """Return ``resp.json()["data"]["persistentId"]``."""
        ds_pid = resp.json()["data"]["persistentId"]
        return ds_pid

    def uploadDatafile (self, filename, pid):
        """Upload ``filename`` to the dataset ``pid``; print and return the response.

        Args:
            filename: path of the file to upload.
            pid: persistent identifier of the target dataset.
        """
        assert(isinstance(filename, str))

        df = Datafile()
        df.set({'pid': pid, 'filename': filename})
        response = self._client().upload_datafile(pid, filename, df.json())
        print(response.json())
        return response
    
#############################################################################################

if __name__ == "__main__":

    # NOTE: placeholder - set this to the dataset's ".json" description file;
    # setDataset rejects any name that does not contain ".json".
    filename = ""

    # The helpers are instance methods, so they need a pyDataFunc object.
    func = pyDataFunc()

    ds  = func.setDataset (filename)
    rs1 = func.createDataset (ds, "A set name")
    pid = func.getPID (rs1)          # PID comes from the create-dataset response
    rs2 = func.uploadDatafile(filename, pid)
    
    
    # 输出结果 .get()["<term>"]
    # 验证是不是合格的json文件 validate_json()
        
from selenium import webdriver

#######################################################################################
# Deploy Driver
class function (object):
    """Minimal Selenium driver wrapper that opens Bing in Chrome."""

    def __init__ (self, inputurl):
        """Start a Chrome session.

        Args:
            inputurl: stored as ``self.url``; not used by ``getBing``.

        NOTE(review): passing the driver path positionally is the Selenium 3
        calling convention; confirm against the installed Selenium version.
        """
        self.url = inputurl
        self.driver = webdriver.Chrome(r"C:/Alan .AIA/Python/Driver/chromedriver.exe")

    def getBing (self):
        """Navigate the browser to the Bing home page."""
        # The URL must carry a scheme for the browser to navigate to it.
        self.driver.get("https://www.bing.com")

#######################################################################################
# Main Function
if __name__ == "__main__":
    # Start the browser wrapper, then open Bing with it.
    browser = function(inputurl = "www.bing.com")
    browser.getBing()

# -*- coding: utf-8 -*-
# Author: juzstu
# Time: 2019/8/22 0:31

import pandas as pd
import numpy as np
import jieba as jb
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import warnings
from tqdm import tqdm
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')


def modified_jd_df(jd_path):
    """Read a tab-separated job-description file into a DataFrame of strings.

    The first line supplies the column names.  Fields are split on tabs only,
    so the last field of every line (header included) keeps its trailing
    newline.  The line at zero-based index 175425 has stray tabs inside one
    field; they are removed before splitting.

    Args:
        jd_path: path of the UTF-8 encoded file.

    Returns:
        DataFrame with one row per data line; empty if the file has no lines.
    """
    rows = []
    # The context manager closes the file even if reading fails part-way.
    with open(jd_path, encoding='utf8') as jd_file:
        for line_no, line in enumerate(jd_file):
            if line_no == 175425:
                line = line.replace('销售\t|置业顾问\t|营销', '销售|置业顾问|营销')
            rows.append(line.split('\t'))
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows[1:], columns=rows[0])


def get_min_salary(x):
    """Return the lower bound encoded at the front of a salary-range string.

    The split position depends only on the string length: 6 characters for
    length 12, 5 for lengths 11 and 10, 4 for length 9.  Any other length
    yields -1.
    """
    prefix_len = {12: 6, 11: 5, 10: 5, 9: 4}.get(len(x))
    return -1 if prefix_len is None else int(x[:prefix_len])


def get_max_salary(x):
    """Return the upper bound encoded at the end of a salary-range string.

    Everything after the lower-bound prefix is parsed: the prefix is 6
    characters for length 12, 5 for lengths 11 and 10, 4 for length 9.  Any
    other length yields -1.
    """
    prefix_len = {12: 6, 11: 5, 10: 5, 9: 4}.get(len(x))
    return -1 if prefix_len is None else int(x[prefix_len:])


def is_same_user_city(df):
    """Tell whether the row's home city is among its desired job cities.

    ``df`` is one row (any mapping with the two keys).  The home city id is
    converted to text and tested with ``in`` against ``desire_jd_city_id``.
    """
    return str(df['live_city_id']) in df['desire_jd_city_id']


def jieba_cnt(df):
    """Count the job's search tokens that also occur in the user's experience.

    The job title and job sub-type are tokenised with jieba's search mode and
    merged into one set; the result is the number of those tokens present in
    the tokenised experience text.

    Args:
        df: one row with the keys 'experience', 'jd_title' and 'jd_sub_type'.

    Returns:
        The overlap size, or 0 when the experience or the sub-type is not a
        string.  A non-string title (e.g. NaN) is ignored rather than passed
        to the tokenizer.
    """
    experience = df['experience']
    jd_title = df['jd_title']
    jd_sub_type = df['jd_sub_type']
    if not (isinstance(experience, str) and isinstance(jd_sub_type, str)):
        return 0

    jd_tokens = set(jb.cut_for_search(jd_sub_type))
    if isinstance(jd_title, str):
        jd_tokens |= set(jb.cut_for_search(jd_title))
    return len(jd_tokens & set(jb.cut_for_search(experience)))


def cur_industry_in_desire(df):
    """Tell whether the current industry id occurs in the desired-industry text.

    Returns the result of a substring test when both values are strings, and
    -1 when either of them is not.
    """
    current = df['cur_industry_id']
    desired = df['desire_jd_industry_id']
    if not (isinstance(current, str) and isinstance(desired, str)):
        return -1
    return current in desired


def desire_in_jd(df):
    """Tell whether the job's sub-type occurs in the user's desired job types.

    Returns the result of a substring test when both values are strings, and
    -1 when either of them is not.
    """
    desired_types = df['desire_jd_type_id']
    sub_type = df['jd_sub_type']
    if not (isinstance(sub_type, str) and isinstance(desired_types, str)):
        return -1
    return sub_type in desired_types


def get_tfidf(df, names, merge_id):
    """Reduce a text column to 10 SVD components of its TF-IDF matrix.

    Args:
        df: frame holding the text column and the id column.
        names: name of the text column; also the prefix of the output columns.
        merge_id: name of the id column returned alongside the components.

    Returns:
        DataFrame with ``merge_id`` followed by ``{names}_svd_0`` ..
        ``{names}_svd_9``, one row per row of ``df``.
    """
    tfidf_vec_tmp = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(df[names])
    svd_tag_tmp = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    tag_svd_tmp = pd.DataFrame(
        svd_tag_tmp.fit_transform(tfidf_vec_tmp),
        # Reuse df's index so the column-wise concat pairs rows by position
        # even when df does not carry a 0..n-1 index.
        index=df.index,
        columns=[f'{names}_svd_{i}' for i in range(10)],
    )
    return pd.concat([df[[merge_id]], tag_svd_tmp], axis=1)


def get_str(x):
    """Tokenise ``x`` with jieba and join the tokens not in ``stop_words`` with spaces."""
    kept_tokens = (token for token in jb.cut(x) if token not in stop_words)
    return ' '.join(kept_tokens)


def offline_eval_map(train_df, label, pred_col):
    """Compute a per-user mean-average-precision style score.

    Within each user, rows are ranked by ``pred_col`` (highest first).  For
    every row whose ``label`` equals 1 the score is its position among the
    user's positive rows divided by its overall rank; scores are averaged per
    user and then across users.

    Args:
        train_df: frame with 'user_id', ``label`` and ``pred_col`` columns.
            It is not modified.
        label: name of the 0/1 target column.
        pred_col: name of the prediction column.

    Returns:
        The mean over users of the per-user mean score.
    """
    ranked = train_df.copy()
    ranked['rank'] = ranked.groupby('user_id')[pred_col].rank(ascending=False, method='first')

    # Work on an explicit copy so the new columns never write into a view.
    hits = ranked[ranked[label] == 1].copy()
    hits[f'{label}_index'] = hits.groupby('user_id')['rank'].rank(ascending=True, method='first')
    # Each positive row already carries its own rank; using it directly avoids
    # an index-aligned division that breaks on duplicate index labels.
    hits['score'] = hits[f'{label}_index'] / hits['rank']
    return hits.groupby('user_id')['score'].mean().mean()


def sub_on_line(train_, test_, pred, label, cate_cols, is_shuffle=True, use_cate=True):
    """Train a 5-fold LightGBM binary classifier and predict the test rows.

    Folds are built over the unique ``user_id`` values, so all rows of one user
    fall on the same side of each split.  Per fold, a model is trained with
    AUC as the validation metric, the test rows are predicted, and the
    validation rows receive their out-of-fold prediction.

    Side effects:
        * adds the column ``{label}_pred`` to ``train_`` and overwrites
          ``test_[label]`` with the mean of the fold predictions;
        * writes the averaged feature importances to 'feat_imp_base.csv';
        * prints progress and the mean validation AUC.

    Args:
        train_: labelled rows; needs 'user_id', 'jd_no', ``label`` and ``pred``.
        test_: rows to predict; needs 'user_id', 'jd_no' and ``pred``.
        pred: list of feature column names.
        label: name of the target column.
        cate_cols: columns passed as ``categorical_feature`` when ``use_cate``.
        is_shuffle: forwarded to ``KFold(shuffle=...)``.
        use_cate: whether to declare ``cate_cols`` as categorical.

    Returns:
        Tuple ``(test_[['user_id', 'jd_no', label]],
        train_[['user_id', 'jd_no', f'{label}_pred', label]])``.

    NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are passed as
    keyword arguments to ``lgb.train``; newer LightGBM releases may expect
    callbacks instead - confirm against the pinned version.
    """
    print(f'data shape:\ntrain--{train_.shape}\ntest--{test_.shape}')
    n_splits = 5
    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=1024)
    # One column of test predictions per fold; averaged after the loop.
    sub_preds = np.zeros((test_.shape[0], folds.n_splits))
    train_[f'{label}_pred'] = 0
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = pred
    print(f'Use {len(pred)} features ...')
    auc_scores = []
    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }
    # Split on users rather than rows: the fold indices below index this array.
    train_user_id = train_['user_id'].unique()
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print(f'the {n_fold} training start ...')
        train_x, train_y = train_.loc[train_['user_id'].isin(train_user_id[train_idx]), pred], train_.loc[
            train_['user_id'].isin(train_user_id[train_idx]), label]
        valid_x, valid_y = train_.loc[train_['user_id'].isin(train_user_id[valid_idx]), pred], train_.loc[
            train_['user_id'].isin(train_user_id[valid_idx]), label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')
        if use_cate:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dvalid],
            early_stopping_rounds=100,
            verbose_eval=100
        )
        # Predict with the best iteration found on the validation set.
        sub_preds[:, n_fold - 1] = clf.predict(test_[pred], num_iteration=clf.best_iteration)
        auc_scores.append(clf.best_score['valid_0']['auc'])
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance()
        # Out-of-fold predictions for this fold's validation users.
        train_.loc[train_['user_id'].isin(train_user_id[valid_idx]), f'{label}_pred'] = \
            clf.predict(valid_x, num_iteration=clf.best_iteration)

    # Average the per-fold importances and persist them, most important first.
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    fold_importance_df[['Feature', 'avg_imp']].to_csv('feat_imp_base.csv', index=False, encoding='utf8')
    test_[label] = np.mean(sub_preds, axis=1)
    print('auc score', np.mean(auc_scores))
    return test_[['user_id', 'jd_no', label]], train_[['user_id', 'jd_no', f'{label}_pred', label]]


#############################################################################################################
# MAIN FUNCTION

if __name__ == "__main__":
    # End-to-end pipeline: load the user / job / action tables, build the
    # features, train the 'satisfied' and 'delivered' models, report the
    # offline score and write the ranked submission to 'sub_base.csv'.

    # `min_years` code of a job -> lower / upper bound of the required years.
    min_work_year = {103: 1, 305: 3, 510: 5, 1099: 10}
    max_work_year = {103: 3, 305: 5, 510: 10}
    # Education level -> ordinal rank.
    degree_map = {'其他': 0, '初中': 1, '中技': 2, '中专': 2, '高中': 2, '大专': 3, '本科': 4,
                  '硕士': 5, 'MBA': 5, 'EMBA': 5, '博士': 6}

    sub_path = './submit/'
    train_data_path = './zhaopin_round1_train_20190716/'
    test_data_path = './zhaopin_round1_test_20190716/'

    def _load_user_table(path):
        """Read a user table and derive the salary-bound columns."""
        users = pd.read_csv(path, sep='\t')
        users['desire_jd_city_id'] = users['desire_jd_city_id'].apply(lambda x: re.findall(r'\d+', x))
        users['desire_jd_salary_id'] = users['desire_jd_salary_id'].astype(str)
        users['min_desire_salary'] = users['desire_jd_salary_id'].apply(get_min_salary)
        users['max_desire_salary'] = users['desire_jd_salary_id'].apply(get_max_salary)
        users['min_cur_salary'] = users['cur_salary_id'].apply(get_min_salary)
        users['max_cur_salary'] = users['cur_salary_id'].apply(get_max_salary)
        users.drop(['desire_jd_salary_id', 'cur_salary_id'], axis=1, inplace=True)
        return users

    # ---- training users -------------------------------------------------
    train_user = _load_user_table(train_data_path + 'table1_user')

    # ---- job descriptions -----------------------------------------------
    train_jd = pd.read_csv(train_data_path + 'table2_jd.csv', sep='\t')
    train_jd.drop(['company_name', 'max_edu_level', 'is_mangerial', 'resume_language_required'], axis=1, inplace=True)

    train_jd['min_work_year'] = train_jd['min_years'].map(min_work_year)
    train_jd['max_work_year'] = train_jd['min_years'].map(max_work_year)
    # The literal text '\N' marks a missing date: route it through a sentinel
    # date so parsing succeeds, then blank out the rows that used it.
    # Assigning the result back (instead of inplace on a column selection)
    # keeps this working under pandas copy-on-write.
    for date_col in ('start_date', 'end_date'):
        train_jd[date_col] = train_jd[date_col].replace(r'\N', '22000101')
    for date_col in ('start_date', 'end_date'):
        train_jd[date_col] = pd.to_datetime(
            train_jd[date_col].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:]}'))
    train_jd.loc[train_jd['end_date'] == '2200-01-01', ['start_date', 'end_date']] = np.nan

    # Stop words are only used for membership tests in get_str, so a set is
    # enough; the context manager closes the file.
    with open('中文停用词表.txt', 'r', encoding='utf8') as stop_file:
        stop_words = {i.strip() for i in stop_file}
    stop_words.update(['\n', '\xa0', '\u3000', '\u2002'])
    # The description column name carries a trailing newline in this data set.
    tmp_cut = Parallel(n_jobs=-1)(delayed(get_str)(train_jd.loc[ind]['job_description\n'])
                                  for ind in tqdm(train_jd.index))

    # TF-IDF + SVD of the job descriptions -> desc_svd_0 .. desc_svd_9.
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(tmp_cut)
    svd_tag = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    tag_svd = svd_tag.fit_transform(tfidf_vec)
    tag_svd = pd.DataFrame(tag_svd)
    tag_svd.columns = [f'desc_svd_{i}' for i in range(10)]
    train_jd = pd.concat([train_jd, tag_svd], axis=1)

    # ---- user actions ---------------------------------------------------
    train_action = pd.read_csv(train_data_path + 'table3_action', sep='\t')
    train_action['user_jd_cnt'] = train_action.groupby(['user_id', 'jd_no'])['jd_no'].transform('count').values
    train_action['jd_cnt'] = train_action.groupby(['user_id'])['jd_no'].transform('count').values
    train_action['jd_nunique'] = train_action.groupby(['user_id'])['jd_no'].transform('nunique').values
    train_action = train_action.drop_duplicates()
    # After sorting, keep='last' retains the row with the highest
    # (delivered, satisfied) pair for every user/job combination.
    train_action.sort_values(['user_id', 'jd_no', 'delivered', 'satisfied'], inplace=True)
    train_action = train_action.drop_duplicates(subset=['user_id', 'jd_no'], keep='last')
    train_action = train_action[train_action['jd_no'].isin(train_jd['jd_no'].unique())]

    train = train_action.merge(train_user, on='user_id', how='left')
    train = train.merge(train_jd, on='jd_no', how='left')
    del train['browsed']

    print('train data base feats already generated ...')

    # ---- test users and exposures ---------------------------------------
    test_user = _load_user_table(test_data_path + 'user_ToBePredicted')

    test = pd.read_csv(test_data_path + 'zhaopin_round1_user_exposure_B_20190819', sep=' ')
    test['user_jd_cnt'] = test.groupby(['user_id', 'jd_no'])['jd_no'].transform('count').values
    test['jd_cnt'] = test.groupby(['user_id'])['jd_no'].transform('count').values
    test['jd_nunique'] = test.groupby(['user_id'])['jd_no'].transform('nunique').values
    test = test.drop_duplicates()

    # -1 marks "label unknown"; it is used below to split train from test.
    test['delivered'] = -1
    test['satisfied'] = -1

    test = test.merge(test_user, on='user_id', how='left')
    test = test.merge(train_jd, on='jd_no', how='left')

    print('test data base feats already generated ...')

    # DataFrame.append no longer exists in pandas 2; concat is its equivalent.
    all_data = pd.concat([train, test], sort=False)

    # ---- pairwise user/job features -------------------------------------
    all_data['jd_user_cnt'] = all_data.groupby(['jd_no'])['user_id'].transform('count').values
    all_data['same_user_city'] = all_data.apply(is_same_user_city, axis=1).astype(int)
    all_data['city'] = all_data['city'].fillna(-1)
    all_data['city'] = all_data['city'].astype(int)
    all_data['same_com_live'] = (all_data['city'] == all_data['live_city_id']).astype(int)
    all_data['min_edu_level'] = all_data['min_edu_level'].apply(lambda x: x.strip() if isinstance(x, str) else x)
    all_data['cur_degree_id'] = all_data['cur_degree_id'].apply(lambda x: x.strip() if isinstance(x, str) else x)
    all_data['min_edu_level_num'] = all_data['min_edu_level'].map(degree_map)
    all_data['cur_degree_id_num'] = all_data['cur_degree_id'].map(degree_map)
    all_data['same_edu'] = (all_data['min_edu_level'] == all_data['cur_degree_id']).astype(int)
    all_data['gt_edu'] = (all_data['cur_degree_id_num'] >= all_data['min_edu_level_num']).astype(int)
    all_data['min_desire_salary_num'] = (all_data['min_desire_salary'] <= all_data['min_salary']).astype(int)
    all_data['min_cur_salary_num'] = (all_data['min_cur_salary'] <= all_data['min_salary']).astype(int)

    all_data['max_desire_salary_num'] = (all_data['max_desire_salary'] <= all_data['max_salary']).astype(int)
    all_data['max_cur_salary_num'] = (all_data['max_cur_salary'] <= all_data['max_salary']).astype(int)
    all_data['same_desire_industry'] = all_data.apply(cur_industry_in_desire, axis=1).astype(int)
    all_data['same_jd_sub'] = all_data.apply(desire_in_jd, axis=1).astype(int)

    all_data['start_month'] = all_data['start_date'].dt.month
    all_data['start_day'] = all_data['start_date'].dt.day
    # The end_* features are taken from end_date (they used to repeat start_date).
    all_data['end_month'] = all_data['end_date'].dt.month
    all_data['end_day'] = all_data['end_date'].dt.day
    all_data['jd_days'] = (all_data['end_date'] - all_data['start_date']).dt.days

    all_data['user_work_year'] = 2019 - all_data['start_work_date'].replace('-', np.nan).astype(float)
    all_data['gt_min_year'] = (all_data['user_work_year'] > all_data['min_work_year']).astype(int)
    all_data['gt_max_year'] = (all_data['user_work_year'] > all_data['max_work_year']).astype(int)
    all_data['len_experience'] = all_data['experience'].apply(
        lambda x: len(x.split('|')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_industry_id_len'] = all_data['desire_jd_industry_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_type_id_len'] = all_data['desire_jd_type_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['eff_exp_cnt'] = all_data.apply(jieba_cnt, axis=1)
    all_data['eff_exp_ratio'] = all_data['eff_exp_cnt'] / all_data['len_experience']
    all_data.drop(['cur_degree_id_num', 'cur_degree_id', 'desire_jd_city_id', 'min_years',
                   'start_work_date', 'start_date', 'end_date', 'key', 'min_edu_level'], axis=1, inplace=True)

    # ---- group statistics -----------------------------------------------
    # City statistics.
    all_data['user_jd_city_nunique'] = all_data.groupby('user_id')['city'].transform('nunique').values
    all_data['jd_user_city_nunique'] = all_data.groupby('jd_no')['live_city_id'].transform('nunique').values

    all_data['jd_title_nunique'] = all_data.groupby('user_id')['jd_title'].transform('nunique').values
    all_data['jd_sub_type_nunique'] = all_data.groupby('user_id')['jd_sub_type'].transform('nunique').values

    all_data['user_desire_jd_industry_id_nunique'] = all_data.groupby('jd_no')['desire_jd_industry_id'].transform(
        'nunique').values
    all_data['user_desire_jd_type_id_nunique'] = all_data.groupby('jd_no')['desire_jd_type_id'].transform(
        'nunique').values

    # Salary, posting-duration and birthday statistics.  The tuple order is the
    # column creation order, which in turn fixes the feature order.
    basic_stats = ('min', 'max', 'mean', 'std')
    group_stats = (
        # (output prefix, group key, source column, statistics)
        ('user_jd_min_salary', 'user_id', 'min_salary', basic_stats),
        ('user_jd_max_salary', 'user_id', 'max_salary', basic_stats),
        ('jd_user_desire_min_salary', 'jd_no', 'min_desire_salary', basic_stats),
        ('jd_user_desire_max_salary', 'jd_no', 'max_desire_salary', basic_stats),
        ('jd_days', 'user_id', 'jd_days', basic_stats + ('skew',)),
        ('age', 'jd_no', 'birthday', basic_stats + ('skew',)),
    )
    for prefix, group_key, source_col, stats in group_stats:
        for stat in stats:
            all_data[f'{prefix}_{stat}'] = all_data.groupby(group_key)[source_col].transform(stat).values

    # ---- encoded / text features ----------------------------------------
    for j in ['jd_title', 'jd_sub_type']:
        le = LabelEncoder()
        all_data[j] = all_data[j].fillna('nan')
        all_data[f'{j}_map_num'] = le.fit_transform(all_data[j])

    # NOTE: for a non-string value this joins the characters of 'nan'
    # ("n a n"); kept as-is so the TF-IDF features do not change.
    all_data['experience'] = all_data['experience'].apply(lambda x: ' '.join(x.split('|') if
                                                                             isinstance(x, str) else 'nan'))
    exp_gp = all_data.groupby('jd_no')['experience'].agg(lambda x: ' '.join(x.to_list())).reset_index()
    exp_gp = get_tfidf(exp_gp, 'experience', 'jd_no')
    all_data = all_data.merge(exp_gp, on='jd_no', how='left')

    # Everything except ids, targets and raw text columns is a model feature.
    use_feats = [c for c in all_data.columns if c not in ['user_id', 'jd_no', 'delivered', 'satisfied'] +
                 ['desire_jd_industry_id', 'desire_jd_type_id', 'cur_industry_id', 'cur_jd_type', 'experience',
                 'jd_title', 'jd_sub_type', 'job_description\n']]

    # ---- training -------------------------------------------------------
    sub_sat, train_pred_sat = sub_on_line(all_data[all_data['satisfied'] != -1], all_data[all_data['satisfied'] == -1],
                                          use_feats, 'satisfied', ['live_city_id', 'city'], use_cate=True)

    sub_dev, train_pred_dev = sub_on_line(all_data[all_data['delivered'] != -1], all_data[all_data['delivered'] == -1],
                                          use_feats, 'delivered', ['live_city_id', 'city'], use_cate=True)

    # Blend the two targets 80/20.
    train_pred_sat['merge_pred'] = train_pred_sat['satisfied_pred'] * 0.8 + train_pred_dev['delivered_pred'] * 0.2
    sub_sat['merge_prob'] = sub_sat['satisfied'] * 0.8 + sub_dev['delivered'] * 0.2

    train_pred_sat = train_pred_sat.merge(all_data[all_data['delivered'] != -1][['user_id', 'jd_no', 'delivered']],
                                          on=['user_id', 'jd_no'], how='left')

    dev_map = offline_eval_map(train_pred_sat, 'delivered', 'merge_pred')
    sat_map = offline_eval_map(train_pred_sat, 'satisfied', 'merge_pred')
    print('dev map:', round(dev_map, 4), 'sat map:', round(sat_map, 4), 'final score:',
          round(0.7 * sat_map + 0.3 * dev_map, 4))

    # ---- submission -----------------------------------------------------
    # Per user: jobs known from the job table first, best score first, then
    # the jobs that are missing from the job table in their original order.
    sub_cols = ['user_id', 'jd_no', 'merge_prob']
    known_jd = sub_sat['jd_no'].isin(train_jd['jd_no'])
    sub_parts = []
    for i in sub_sat['user_id'].unique():
        is_user = sub_sat['user_id'] == i
        sub_parts.append(sub_sat[is_user & known_jd].sort_values('merge_prob', ascending=False)[sub_cols])
        sub_parts.append(sub_sat[is_user & ~known_jd][sub_cols])
    # A single concat replaces the repeated per-user append.
    sub_df = pd.concat(sub_parts) if sub_parts else pd.DataFrame(columns=sub_cols)
    sub_df[['user_id', 'jd_no']].to_csv('sub_base.csv', index=False)

import pandas as pd
import numpy  as np
import jieba
import json


# Expected input: a JSON array of objects, i.e. [{}, {}, ..., {}].

# Filesystem locations (Windows-style paths).
# CSV_PATH is the prefix under which converted CSV files are written;
# FILEPATH is the default resume JSON file, located under that same prefix.
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Automation\\"
FILEPATH = CSV_PATH + "resume_Result.json"



# Sub Functions
def inputSource(sourcePath):
    """Load a resume data set from a JSON or CSV file into a DataFrame.

    A JSON source is printed and additionally exported as a CSV file named
    after the source file (the text before its first "."), written under the
    module-level ``CSV_PATH`` prefix.

    Args:
        sourcePath: Path of the input file. The file type is chosen by
            whether the file name contains ".json" or ".csv".

    Returns:
        The pandas DataFrame read from ``sourcePath``.

    Raises:
        ValueError: If the file name contains neither ".json" nor ".csv".
    """
    # ntpath.basename accepts both "\\" and "/" as separators, so the file
    # name is isolated correctly whichever style the caller used.
    import ntpath

    filename = ntpath.basename(sourcePath)
    if ".json" in filename:
        targetDS = pd.read_json(sourcePath, encoding="utf-8")
        print(targetDS)
        # The utf_8_sig codec prepends a UTF-8 BOM to the exported CSV.
        targetDS.to_csv(CSV_PATH + filename.split(".")[0] + ".csv", encoding="utf_8_sig")
    elif ".csv" in filename:
        targetDS = pd.read_csv(sourcePath, encoding="utf-8")
    else:
        # Previously this fell through and failed with UnboundLocalError at
        # the return statement; report the actual problem instead.
        raise ValueError(
            "Unsupported source file type (expected .json or .csv): " + repr(sourcePath)
        )
    return targetDS

# Script entry point: when executed directly (not imported), load the
# data set stored at the default FILEPATH.
if __name__ == "__main__":
    target = inputSource(FILEPATH)

// C#/MSAL


using Microsoft.Identity.Client;
using Newtonsoft.Json.Linq;
using System;
using System.Net.Http;
using System.Net.Http.Headers;

namespace PowerApps.Samples
{
    /// <summary>
    /// Console sample that signs in interactively through MSAL and then calls
    /// the Web API 'WhoAmI' unbound function, printing the returned user ID.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Acquires an access token, configures an <see cref="HttpClient"/> for the
        /// Web API, issues a single 'WhoAmI' request, writes the outcome to the
        /// console, and waits for a key press before exiting.
        /// </summary>
        static void Main()
        {
            // TODO Specify the Dataverse environment name to connect with.
            string resource = "https://<env-name>.api.<region>.dynamics.com";

            // Azure Active Directory app registration shared by all Power App samples.
            // For your custom apps, you will need to register them with Azure AD yourself.
            // See https://docs.microsoft.com/powerapps/developer/data-platform/walkthrough-register-app-azure-active-directory
            var clientId = "51f81489-12ee-4a9e-aaae-a2591f45987d";
            var redirectUri = "app://58145B91-0C36-4500-8554-080854F2AC97";

            #region Authentication

            var authBuilder = PublicClientApplicationBuilder.Create(clientId)
                             .WithAuthority(AadAuthorityAudience.AzureAdMultipleOrgs)
                             .WithRedirectUri(redirectUri)
                             .Build();
            var scope = resource + "/.default";
            string[] scopes = { scope };

            // Blocks the calling thread until the interactive sign-in completes.
            AuthenticationResult token =
                authBuilder.AcquireTokenInteractive(scopes).ExecuteAsync().Result;
            #endregion Authentication

            // HttpClient is IDisposable; the using statement releases it even if
            // the request below throws.
            using (var client = new HttpClient())
            {
                #region Client configuration

                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors#web-api-url-and-versions
                client.BaseAddress = new Uri(resource + "/api/data/v9.2/");
                client.Timeout = new TimeSpan(0, 2, 0);    // Standard two minute timeout on web service calls.

                // Default headers for each Web API call.
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors#http-headers
                HttpRequestHeaders headers = client.DefaultRequestHeaders;
                headers.Authorization = new AuthenticationHeaderValue("Bearer", token.AccessToken);
                headers.Add("OData-MaxVersion", "4.0");
                headers.Add("OData-Version", "4.0");
                headers.Accept.Add(
                    new MediaTypeWithQualityHeaderValue("application/json"));
                #endregion Client configuration

                #region Web API call

                // Invoke the Web API 'WhoAmI' unbound function.
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/compose-http-requests-handle-errors
                // See https://docs.microsoft.com/powerapps/developer/data-platform/webapi/use-web-api-functions#unbound-functions
                // The response is IDisposable as well, so it is scoped with using.
                using (var response = client.GetAsync("WhoAmI").Result)
                {
                    if (response.IsSuccessStatusCode)
                    {
                        // Parse the JSON formatted service response to obtain the user ID.
                        JObject body = JObject.Parse(
                            response.Content.ReadAsStringAsync().Result);
                        Guid userId = (Guid)body["UserId"];

                        Console.WriteLine("Your user ID is {0}", userId);
                    }
                    else
                    {
                        Console.WriteLine("Web API call failed");
                        Console.WriteLine("Reason: " + response.ReasonPhrase);
                    }
                }
                #endregion Web API call
            }

            // Pause program execution by waiting for a key press.
            Console.ReadKey();
        }
    }
}