123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- import requests
- from bs4 import BeautifulSoup as bs
- import os
- from azure.storage.blob import BlobServiceClient, BlobClient
- from azure.storage.blob import ContentSettings, ContainerClient
- #Your Connexion String
- MY_CONNECTION_STRING = "DefaultEndpointsProtocol************************"
- #Your Container Name
- MY_IMAGE_CONTAINER = "picture"
- #Your local path
- LOCAL_IMAGE_PATH = "..\Picture"
- #change the url to the one you want to scrape
- URL = 'WebSiteURL'
- class AzureBlobStorage:
- def Scrapp(self):
- #create folder with the picture if it doesn't exist
- if not os.path.exists('.\Picture'):
- os.mkdir('.\Picture')
- os.chdir('.\Picture')
- #Change the number to begin where you want to start
- page_begin = 1
- #Change the number to the number of pages you want to scrape
- page_end = 230 + 1
- #If you want to scrape only one page, change the page_end to page_begin or delete the loop
- for page in range(page_begin, page_end):
- req = requests.get(URL + str(page))
- soup = bs(req.text, 'html.parser')
- images = soup.find_all('img')
- for images in images:
- name = images['src']
- alpha = images['src']
- link = 'WebSiteURL' + alpha
- print(link)
- #replace the name of the photo it's better :))
- with open(name.replace(' ', '-').replace('/', '').replace('"', "'").replace('.jpg','') + '.jpg','wb') as f:
- im = requests.get(link)
- f.write(im.content)
- #check the name on the terminal
- print('Writing: ', name)
- def __init__(self):
- # Initialize the connection to Azure storage account
- self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
- def upload_all_images_in_folder(self):
- # Get all files with jpg extension and exclude directories
- all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
- if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
- # Upload each file
- for file_name in all_file_names:
- self.upload_image(file_name)
- def upload_image(self, file_name):
- # Create blob with same name as local file name
- blob_client = self.blob_service_client.get_blob_client(container=MY_IMAGE_CONTAINER,
- blob=file_name)
- # Get full path to the file
- upload_file_path = os.path.join(LOCAL_IMAGE_PATH, file_name)
- # Create blob on storage
- # Overwrite if it already exists!
- image_content_setting = ContentSettings(content_type='image/jpeg')
- print(f"uploading file - {file_name}")
- with open(upload_file_path, "rb") as data:
- blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)
- def upload_all_images_in_folder(self):
- # Get all files with jpg extension and exclude directories
- all_file_names = [f for f in os.listdir(LOCAL_IMAGE_PATH)
- if os.path.isfile(os.path.join(LOCAL_IMAGE_PATH, f)) and ".jpg" in f]
- # Upload each file
- for file_name in all_file_names:
- self.upload_image(file_name)
- if __name__=='__main__':
-
- # Initialize class and upload files
- azure_blob_file_uploader = AzureBlobStorage()
- azure_blob_file_uploader.Scrapp()
- azure_blob_file_uploader.upload_all_images_in_folder()
|