# model_train.py

import text_processing
import model_helpers
import azure_storage_helpers

##################
# BUILDING DATASET
##################

# GETTING CONTENT FROM COSMOSDB

# CosmosDB config
# TODO: replace the values with your own (if you don't have a partition key, leave it blank)
cosmosConfig = {
    'ENDPOINT': 'YOUR_ENDPOINT',
    'PRIMARYKEY': 'YOUR_PRIMARY_KEY',
    'DATABASE': 'data',
    'COLLECTION': 'documents',
    'PARTITIONKEY': 'name'
}

# Initializing Cosmos client
cosmosClient = azure_storage_helpers.InitializeCosmosClient(cosmosConfig)
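
# For reference, a minimal sketch of what azure_storage_helpers.InitializeCosmosClient
# might look like with the azure-cosmos (v4) SDK (an assumption; the helper lives in
# azure_storage_helpers.py and could instead target an older SDK such as pydocumentdb):
#
#     from azure.cosmos import CosmosClient
#
#     def InitializeCosmosClient(config):
#         # Authenticate against the account with its endpoint and primary key
#         return CosmosClient(config['ENDPOINT'], credential=config['PRIMARYKEY'])
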
# Getting the contents of all documents in the specified collection
documents = azure_storage_helpers.ReadDocuments(cosmosClient, cosmosConfig)
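
# A sketch of what ReadDocuments could do under the same azure-cosmos (v4) assumption,
# returning each document as a plain dict (the real helper may filter or page results):
#
#     def ReadDocuments(client, config):
#         database = client.get_database_client(config['DATABASE'])
#         container = database.get_container_client(config['COLLECTION'])
#         # read_all_items() yields every document in the container
#         return list(container.read_all_items())
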
# EXTRACTING CONTENT OF INTEREST
# In this sample, the text is assumed to be stored as sections inside pages

# Will contain all the rows to be written to a CSV file
data = []
# Column names
firstRow = ['text', 'label']
data.append(firstRow)

for doc in documents:
    pages = doc.get('pages', [])  # tolerate documents without a 'pages' field
    for page in pages:
        sections = page['sections']
        for section in sections:
            text = section['text']
            text = text_processing.normalizeText(text)
            label = section['label']
            row = [text, label]
            data.append(row)
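
# For reference, text_processing.normalizeText might be as simple as the sketch
# below (an assumption; the real normalization steps are defined in text_processing.py):
#
#     import re
#
#     def normalizeText(text):
#         text = text.lower()                       # case-fold
#         return re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
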
#################
# STORING DATASET
#################

# Creating the CSV file
azure_storage_helpers.createCSV(data, 'csvdataset.csv')
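
# A minimal sketch of createCSV using the standard library (assumed; the helper
# only needs to write the list of rows built above to disk):
#
#     import csv
#
#     def createCSV(data, path):
#         with open(path, 'w', newline='', encoding='utf-8') as f:
#             csv.writer(f).writerows(data)
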
# Blob Storage config
# TODO: replace the values with your own
blobConfig = {
    'ACCOUNTNAME': 'YOUR_STORAGE_NAME',
    'KEY': 'YOUR_KEY',
    'CONTAINER': 'main'
}

# Initializing Blob Storage service
blob_service = azure_storage_helpers.InitializeBlobService(blobConfig)
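
# The name blob_service suggests the legacy azure-storage-blob (v2) SDK; under that
# assumption, InitializeBlobService might look like this sketch:
#
#     from azure.storage.blob import BlockBlobService
#
#     def InitializeBlobService(config):
#         return BlockBlobService(account_name=config['ACCOUNTNAME'],
#                                 account_key=config['KEY'])
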
# Uploading dataset to Blob Storage
azure_storage_helpers.uploadFile(blobConfig, blob_service, 'dataset.csv', 'csvdataset.csv')
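
# Sketch of uploadFile under the same legacy-SDK assumption, also assuming the third
# argument is the blob name and the fourth the local path (matching getFile below):
#
#     def uploadFile(config, service, blobName, localPath):
#         service.create_blob_from_path(config['CONTAINER'], blobName, localPath)
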
################
# TRAINING MODEL
################

# GETTING DATASET
datasetPath = "csvdataset.csv"
# Getting the dataset from Blob Storage
# Comment out the next line if you want to use a local copy instead
azure_storage_helpers.getFile(blobConfig, blob_service, 'dataset.csv', 'csvdataset.csv')
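
# Sketch of getFile, the download counterpart, under the same assumptions:
#
#     def getFile(config, service, blobName, localPath):
#         service.get_blob_to_path(config['CONTAINER'], blobName, localPath)
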
# Creating the pandas dataframe
df = model_helpers.createDataframe(datasetPath)

# SPLITTING DATASET
train, test = model_helpers.split(df)
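
# model_helpers.createDataframe and model_helpers.split are not shown here; plausible
# one-liners, assuming pandas and scikit-learn (the split ratio is a guess):
#
#     import pandas as pd
#     from sklearn.model_selection import train_test_split
#
#     def createDataframe(path):
#         return pd.read_csv(path)
#
#     def split(df):
#         return train_test_split(df, test_size=0.2, random_state=42)
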
# CREATING PIPELINE
pipe = model_helpers.createPipeline()
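
# For text classification, createPipeline is likely a scikit-learn Pipeline; the
# vectorizer/classifier pair below is an assumption, not the repo's actual choice:
#
#     from sklearn.pipeline import Pipeline
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.svm import LinearSVC
#
#     def createPipeline():
#         return Pipeline([
#             ('tfidf', TfidfVectorizer()),  # text -> TF-IDF features
#             ('clf', LinearSVC())           # linear classifier over those features
#         ])
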
# TRAINING
train1 = train['text'].tolist()
labelsTrain1 = train['label'].tolist()
pipe.fit(train1, labelsTrain1)

# TESTING
test1 = test['text'].tolist()
labelsTest1 = test['label'].tolist()
preds = pipe.predict(test1)
accuracy = model_helpers.getAccuracy(labelsTest1, preds)
print("Accuracy:", accuracy)
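
# getAccuracy presumably wraps scikit-learn's accuracy metric:
#
#     from sklearn.metrics import accuracy_score
#
#     def getAccuracy(labels, preds):
#         return accuracy_score(labels, preds)
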
######################
# SAVING TRAINED MODEL
######################

# Saves the trained model locally and uploads it to Blob Storage as well
azure_storage_helpers.uploadPickle(blobConfig, blob_service, pipe, "model.pkl", "model.pkl")
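
# A sketch of uploadPickle consistent with the comment above (pickle the model
# locally, then push it to Blob Storage; the legacy-SDK assumption applies again):
#
#     import pickle
#
#     def uploadPickle(config, service, obj, blobName, localPath):
#         with open(localPath, 'wb') as f:
#             pickle.dump(obj, f)
#         service.create_blob_from_path(config['CONTAINER'], blobName, localPath)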