# utils_36.py (1.3 KB)

  1. def download_unknowns(url: str) -> None:
  2. """."""
  3. page_content: bytes = get_none_soup(url)
  4. page_string: bytes = page_content[0:100]
  5. """parse section of page bytes and use as name. If unknown encoding
  6. convert to number string (exclude first few bytes that state filetype) """
  7. try:
  8. page_unicode = page_string.decode("ISO-8859-1").replace(R'%', '_')
  9. page_parsed = [char for char in page_unicode if char.isalnum() or char == '_']
  10. unknown_file_name = "".join(page_parsed)[10:30]
  11. except UnicodeDecodeError:
  12. try:
  13. page_unicode = page_string.decode('utf-8').replace(R'%', '_')
  14. page_parsed = [char for char in page_unicode if char.isalnum() or char == '_']
  15. unknown_file_name = "".join(page_parsed)[10:30]
  16. except UnicodeDecodeError:
  17. unknown_file_name = "unk_"
  18. for char in page_content[10:30]:
  19. if char != b'\\':
  20. unknown_file_name += str(char)
  21. print(unknown_file_name)
  22. """check beginning of page bytes for a filetype"""
  23. if b'%PDF' in page_string: # ;
  24. extension = '.pdf'
  25. else:
  26. extension = '.unk.txt'
  27. with open(save_file_dir + '/' + unknown_file_name + extension, 'wb') as file:
  28. file.write(page_content)