1234567891011121314151617181920212223242526272829 |
def download_unknowns(url: str) -> None:
    """Download a page of unknown type and save it under a derived file name.

    The raw bytes come from ``get_none_soup(url)``. The file name is built
    from the first 100 bytes: they are decoded to text, ``%`` is replaced by
    ``_``, everything except alphanumerics and ``_`` is dropped, and
    characters 10-29 of what remains are used (the first few characters are
    skipped because they are expected to state the file type). When that
    yields nothing, the name falls back to ``unk_`` followed by the decimal
    values of bytes 10-29 of the page.

    The chosen name is printed, and the full content is written into
    ``save_file_dir`` with a ``.pdf`` extension when ``%PDF`` occurs in the
    first 100 bytes, or ``.unk.txt`` otherwise.

    Args:
        url: Address of the page to download.
    """
    import codecs
    import os

    page_content: bytes = get_none_soup(url)
    page_head: bytes = page_content[:100]

    # UTF-8 has to be tried before ISO-8859-1: Latin-1 accepts every byte
    # value, so it never raises and would hide any later fallback.
    # The incremental decoder tolerates a multi-byte character cut in half by
    # the 100-byte slice, while still raising on genuinely invalid bytes.
    try:
        head_text = codecs.getincrementaldecoder("utf-8")().decode(page_head)
    except UnicodeDecodeError:
        head_text = page_head.decode("ISO-8859-1")

    kept_chars = [
        char
        for char in head_text.replace("%", "_")
        if char.isalnum() or char == "_"
    ]
    unknown_file_name = "".join(kept_chars)[10:30]

    if not unknown_file_name:
        # Too little usable text (short or mostly binary page). Iterating
        # bytes yields ints, so this produces digits only and is always safe
        # to use in a file name.
        unknown_file_name = "unk_" + "".join(
            str(byte) for byte in page_content[10:30]
        )
    print(unknown_file_name)

    # Look for a file-type marker near the beginning of the page.
    extension = ".pdf" if b"%PDF" in page_head else ".unk.txt"

    target_path = os.path.join(save_file_dir, unknown_file_name + extension)
    with open(target_path, "wb") as file:
        file.write(page_content)
|