import text_processing
import model_helpers
import azure_storage_helpers

##################
# BUILDING DATASET
##################

# GETTING CONTENT FROM COSMOSDB
# CosmosDB config
# TODO: replace the values with your own (if you don't have a partition key, leave blank)
cosmosConfig = {
    'ENDPOINT': 'YOUR_ENDPOINT',
    'PRIMARYKEY': 'YOUR_PRIMARY_KEY',
    'DATABASE': 'data',
    'COLLECTION': 'documents',
    'PARTITIONKEY': 'name'
}
# Initializing Cosmos client
cosmosClient = azure_storage_helpers.InitializeCosmosClient(cosmosConfig)
# Getting the contents of all documents in the specified collection
documents = azure_storage_helpers.ReadDocuments(cosmosClient, cosmosConfig)
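
# For reference, a hedged sketch of what InitializeCosmosClient/ReadDocuments
# presumably wrap, assuming the azure-cosmos v4 SDK (the sample's actual
# helpers may target a different SDK version):
def _readDocumentsSketch(config):
    from azure.cosmos import CosmosClient  # assumed dependency
    client = CosmosClient(config['ENDPOINT'], credential=config['PRIMARYKEY'])
    container = (client.get_database_client(config['DATABASE'])
                       .get_container_client(config['COLLECTION']))
    return list(container.read_all_items())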

# EXTRACTING CONTENT OF INTEREST
# In this sample, the text is assumed to be stored as sections inside pages
# Will contain all the rows to be written to the CSV file
data = []
# Header row with the column names
firstRow = ['text', 'label']
data.append(firstRow)
for doc in documents:
    # Default to an empty list so documents without pages are skipped
    pages = doc.get('pages', [])
    for page in pages:
        sections = page['sections']
        for section in sections:
            text = section['text']
            text = text_processing.normalizeText(text)
            label = section['label']
            row = [text, label]
            data.append(row)
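
# The loop above assumes each Cosmos DB document is shaped roughly like this:
# {
#     "name": "...",
#     "pages": [
#         {"sections": [{"text": "...", "label": "..."}, ...]},
#         ...
#     ]
# }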

#################
# STORING DATASET
#################

# Creating CSV file
azure_storage_helpers.createCSV(data, 'csvdataset.csv')
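
# A minimal sketch of what createCSV might look like (an assumption, not the
# sample's actual implementation):
def _createCSVSketch(rows, filePath):
    import csv
    with open(filePath, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)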

# Blob Storage config
# TODO: replace the values with your own
blobConfig = {
    'ACCOUNTNAME': 'YOUR_STORAGE_NAME',
    'KEY': 'YOUR_KEY',
    'CONTAINER': 'main'
}
# Initializing Blob Storage service
blob_service = azure_storage_helpers.InitializeBlobService(blobConfig)
# Uploading the local file 'csvdataset.csv' to Blob Storage as 'dataset.csv'
azure_storage_helpers.uploadFile(blobConfig, blob_service, 'dataset.csv', 'csvdataset.csv')
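
# A hedged sketch of the upload step, assuming the azure-storage-blob v12 SDK;
# the sample's InitializeBlobService/uploadFile helpers may target an older SDK:
def _uploadFileSketch(config, blobName, filePath):
    from azure.storage.blob import BlobServiceClient  # assumed dependency
    service = BlobServiceClient(
        account_url="https://{}.blob.core.windows.net".format(config['ACCOUNTNAME']),
        credential=config['KEY'])
    with open(filePath, 'rb') as f:
        service.get_blob_client(config['CONTAINER'], blobName).upload_blob(f, overwrite=True)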

################
# TRAINING MODEL
################

# GETTING DATASET
datasetPath = 'csvdataset.csv'
# Downloading the dataset from Blob Storage
# Comment out this line if you want to use a local copy instead
azure_storage_helpers.getFile(blobConfig, blob_service, 'dataset.csv', 'csvdataset.csv')
# Creating the pandas dataframe
df = model_helpers.createDataframe(datasetPath)

# SPLITTING DATASET
train, test = model_helpers.split(df)
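
# split presumably wraps scikit-learn's train_test_split; a sketch under that
# assumption (the 80/20 ratio and fixed seed are illustrative, not confirmed):
def _splitSketch(frame):
    from sklearn.model_selection import train_test_split
    return train_test_split(frame, test_size=0.2, random_state=42)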

# CREATING PIPELINE
pipe = model_helpers.createPipeline()
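
# The actual estimators live in model_helpers; a plausible text-classification
# pipeline, assuming TF-IDF features feeding logistic regression (an assumption,
# not necessarily the sample's choice):
def _createPipelineSketch():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    return Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])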

# TRAINING
train1 = train['text'].tolist()
labelsTrain1 = train['label'].tolist()
pipe.fit(train1, labelsTrain1)

# TESTING
test1 = test['text'].tolist()
labelsTest1 = test['label'].tolist()
preds = pipe.predict(test1)
accuracy = model_helpers.getAccuracy(labelsTest1, preds)
print("Accuracy:", accuracy)

######################
# SAVING TRAINED MODEL
######################

# Uploading the trained model to Blob Storage and saving it locally as model.pkl
azure_storage_helpers.uploadPickle(blobConfig, blob_service, pipe, 'model.pkl', 'model.pkl')
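
# A hedged sketch of uploadPickle, assuming pickle for serialization and a
# v12 BlobServiceClient like the one sketched above (the sample's helper may differ):
def _uploadPickleSketch(config, service, obj, blobName, filePath):
    import pickle
    with open(filePath, 'wb') as f:  # save the model locally
        pickle.dump(obj, f)
    with open(filePath, 'rb') as f:  # then upload the same file to Blob Storage
        service.get_blob_client(config['CONTAINER'], blobName).upload_blob(f, overwrite=True)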