12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import text_processing
- import pandas as pd
- import sklearn
- from sklearn import cluster
- from sklearn.model_selection import train_test_split
- from sklearn import metrics
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.pipeline import Pipeline
- from sklearn.svm import LinearSVC
- from sklearn.metrics import accuracy_score
- ###########
- # TRAINING
- ###########
- # Reads a pipe-delimited ("|") CSV file from the given path into a pandas DataFrame
def createDataframe(path, *, sep="|", encoding="utf-8"):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        path: Location of the file; passed straight to ``pd.read_csv``.
        sep: Field delimiter. Defaults to ``"|"``, the value this function
            previously hard-coded.
        encoding: Text encoding of the file. Defaults to ``"utf-8"``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Side effects:
        Prints ``"Data frame created."`` once the file has been parsed.
    """
    data = pd.read_csv(path, sep=sep, encoding=encoding)
    print("Data frame created.")
    return data
- # Splits dataset into training and testing data
def split(dataframe, *, test_size=0.33, random_state=42):
    """Split a dataset into training and testing subsets.

    Args:
        dataframe: The dataset to split; forwarded to ``train_test_split``.
        test_size: Size of the test partition, forwarded to
            ``train_test_split``. Defaults to ``0.33``, the value this
            function previously hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the shuffle
            is reproducible. Defaults to ``42``, the previous fixed value.

    Returns:
        A ``(train, test)`` tuple as produced by ``train_test_split``.

    Side effects:
        Prints the ``.shape`` of both partitions.
    """
    train, test = train_test_split(
        dataframe, test_size=test_size, random_state=random_state
    )
    print('Training Data Shape:', train.shape)
    print('Testing Data Shape:', test.shape)
    return train, test
- # Creates pipeline
def createPipeline():
    """Assemble the text-classification pipeline (not yet fitted).

    The pipeline has three named steps, in order:

    * ``'cleanText'``  - a ``text_processing.CleanTextTransformer`` instance
    * ``'vectorizer'`` - a ``CountVectorizer`` that tokenizes with
      ``text_processing.tokenization``
    * ``'clf'``        - a ``LinearSVC`` with default settings

    Returns:
        The constructed ``sklearn.pipeline.Pipeline``; nothing is fitted here.
    """
    # Components are created in the same order as before: vectorizer,
    # classifier, then the cleaning transformer.
    count_vectorizer = CountVectorizer(tokenizer=text_processing.tokenization)
    classifier = LinearSVC()
    cleaner = text_processing.CleanTextTransformer()
    return Pipeline(
        steps=[
            ('cleanText', cleaner),
            ('vectorizer', count_vectorizer),
            ('clf', classifier),
        ]
    )
- # Returns accuracy
def getAccuracy(labels, preds):
    """Score predictions against the true labels.

    Args:
        labels: Ground-truth labels.
        preds: Predicted labels, aligned with ``labels``.

    Returns:
        Whatever ``sklearn.metrics.accuracy_score`` returns for the pair
        (with its default settings, the fraction of matching entries).
    """
    score = accuracy_score(y_true=labels, y_pred=preds)
    return score
- #############
- # PREDICTIONS
- #############
- # Returns predicted label for given text
def getPrediction(txt, model):
    """Predict the label for a single piece of text.

    Args:
        txt: The text to classify.
        model: Any object exposing ``predict(sequence)`` whose result can be
            indexed (e.g. a fitted pipeline).

    Returns:
        The first (and only) element of the model's prediction output.

    Side effects:
        Prints the predicted label.
    """
    # predict() is called with a batch, so wrap the text in a one-item list
    # and unwrap the single result.
    label = model.predict([txt])[0]
    print("Prediction: ", label)
    return label
|