# stdlib
import random

# third-party
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split has lived in model_selection since 0.18, so this
# import works on both old and new versions.
from sklearn.model_selection import train_test_split
def getTokens(inputString):
    """Character-level tokenizer for TfidfVectorizer.

    Our tokens are single characters rather than full words, which suits
    password strings.

    Args:
        inputString: the string to tokenize.

    Returns:
        A list of one-character strings, in original order.
    """
    # Iterating a str yields its characters, so list() replaces the
    # manual append loop with one C-level call.
    return list(inputString)
# ---- Load and shuffle the labelled password data (script level) ----
filepath = '/data.csv'  # path for password file (rows: password, strength label)

# read_csv already returns a DataFrame, so no pd.DataFrame() re-wrap is
# needed; name the separator explicitly (positional sep is deprecated).
# NOTE(review): error_bad_lines was removed in pandas 2.0 — switch to
# on_bad_lines='skip' if this runs on a modern pandas; kept as-is to match
# the old scikit-learn API this file targets.
data = pd.read_csv(filepath, sep=',', error_bad_lines=False)
passwords = np.array(data)

# BUG FIX: random.shuffle() on a 2-D numpy array corrupts the data — its
# swap idiom assigns through row *views*, duplicating some rows and losing
# others. np.random.shuffle shuffles the rows in place correctly.
np.random.shuffle(passwords)  # shuffling randomly for robustness

y = [row[1] for row in passwords]             # labels (second column)
allpasswords = [row[0] for row in passwords]  # actual passwords (first column)
vectorizer = Tfidf