# blob-upload-2.py (3.5 KB)
  1. import requests
  2. from bs4 import BeautifulSoup as bs
  3. import os
  4. from azure.storage.blob import BlobServiceClient, BlobClient
  5. from azure.storage.blob import ContentSettings, ContainerClient
  6. #Your Connexion String
  7. MY_CONNECTION_STRING = "DefaultEndpointsProtocol************************"
  8. #Your Container Name
  9. MY_IMAGE_CONTAINER = "picture"
  10. #Your local path
  11. LOCAL_IMAGE_PATH = "..\Picture"
  12. #change the url to the one you want to scrape
  13. URL = 'WebSiteURL'
  14. class AzureBlobStorage:
  15. def Scrapp(self):
  16. #create folder with the picture if it doesn't exist
  17. if not os.path.exists('.\Picture'):
  18. os.mkdir('.\Picture')
  19. os.chdir('.\Picture')
  20. #Change the number to begin where you want to start
  21. page_begin = 1
  22. #Change the number to the number of pages you want to scrape
  23. page_end = 230 + 1
  24. #If you want to scrape only one page, change the page_end to page_begin or delete the loop
  25. for page in range(page_begin, page_end):
  26. req = requests.get(URL + str(page))
  27. soup = bs(req.text, 'html.parser')
  28. images = soup.find_all('img')
  29. for images in images:
  30. name = images['src']
  31. alpha = images['src']
  32. link = 'WebSiteURL' + alpha
  33. print(link)
  34. #replace the name of the photo it's better :))
  35. with open(name.replace(' ', '-').replace('/', '').replace('"', "'").replace('.jpg','') + '.jpg','wb') as f:
  36. im = requests.get(link)
  37. f.write(im.content)
  38. #check the name on the terminal
  39. print('Writing: ', name)
  40. def __init__(self):
  41. # Initialize the connection to Azure storage account
  42. self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
  43. def upload_all_images_in_folder(self):
  44. # Get all files with jpg extension and exclude directories
  45. all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
  46. if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
  47. # Upload each file
  48. for file_name in all_file_names:
  49. self.upload_image(file_name)
  50. def upload_image(self, file_name):
  51. # Create blob with same name as local file name
  52. blob_client = self.blob_service_client.get_blob_client(container=MY_IMAGE_CONTAINER,
  53. blob=file_name)
  54. # Get full path to the file
  55. upload_file_path = os.path.join(LOCAL_IMAGE_PATH, file_name)
  56. # Create blob on storage
  57. # Overwrite if it already exists!
  58. image_content_setting = ContentSettings(content_type='image/jpeg')
  59. print(f"uploading file - {file_name}")
  60. with open(upload_file_path, "rb") as data:
  61. blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)
  62. def upload_all_images_in_folder(self):
  63. # Get all files with jpg extension and exclude directories
  64. all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
  65. if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
  66. # Upload each file
  67. for file_name in all_file_names:
  68. self.upload_image(file_name)
  69. if __name__=='__main__':
  70. # Initialize class and upload files
  71. azure_blob_file_uploader = AzureBlobStorage()
  72. azure_blob_file_uploader.Scrapp()
  73. azure_blob_file_uploader.upload_all_images_in_folder()