# model_helpers.py — training and prediction helpers for the text classifier
import pandas as pd
import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import text_processing
###########
# TRAINING
###########
  14. # Gets file from Blob Storage and stores it locally
  15. def createDataframe(path):
  16. data = pd.read_csv(path,sep="|",encoding='utf-8')
  17. print("Data frame created.")
  18. return data
  19. # Splits dataset into training and testing data
  20. def split(dataframe):
  21. train, test = train_test_split(dataframe, test_size=0.33, random_state=42)
  22. print('Training Data Shape:', train.shape)
  23. print('Testing Data Shape:', test.shape)
  24. return train,test
  25. # Creates pipeline
  26. def createPipeline():
  27. vectorizer = CountVectorizer(tokenizer=text_processing.tokenization)
  28. clf = LinearSVC()
  29. pipe = Pipeline([('cleanText', text_processing.CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
  30. return pipe
  31. # Returns accuracy
  32. def getAccuracy(labels,preds):
  33. return accuracy_score(labels, preds)
#############
# PREDICTIONS
#############
  37. # Returns predicted label for given text
  38. def getPrediction(txt,model):
  39. testTxt = [txt]
  40. prediction = model.predict(testTxt)[0]
  41. print("Prediction: ", prediction)
  42. return prediction