scraping_medium.py

import os
import re
import sys

import requests
from bs4 import BeautifulSoup

# switch to the directory of the currently running script
# (os.path keeps the separator handling portable across platforms)
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# function to get the html of the page
def get_page():
    global url
    url = input('Enter url of a medium article: ')
    # reject anything that does not look like a medium.com article link
    if not re.match(r'https?://medium\.com/', url):
        print('Please enter a valid website, or make sure it is a medium article')
        sys.exit(1)
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup
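# Note (not part of the original script): the pattern above only matches
# articles served directly from medium.com; posts hosted on user or
# publication subdomains such as https://username.medium.com/... will
# be rejected by this check.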
# function to remove all the html tags and replace some with specific strings
def purify(text):
    # tags that should become line breaks rather than vanish
    rep = {"<br>": "\n", "<br/>": "\n", "<li>": "\n"}
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
    # strip every remaining tag (raw string avoids the invalid '\<' escape)
    text = re.sub(r'<(.*?)>', '', text)
    return text
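# A quick illustration of purify (not part of the original script):
#   purify('<p>Hello<br>world</p>')    -> 'Hello\nworld'
#   purify('<ul><li>one<li>two</ul>')  -> '\none\ntwo'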
# function to compile all of the scraped text in one string
def collect_text(soup):
    fin = f'url: {url}\n\n'
    main = soup.head.title.text.split('|')
    global title
    title = main[0].strip()
    fin += f'Title: {title.upper()}'
    # the part after '|' (usually the publication name) may be absent
    if len(main) > 1:
        fin += f'\n{main[1].strip()}'
    header = soup.find_all('h1')
    j = 1
    try:
        # everything before the second <h1> is treated as the introduction
        fin += '\n\nINTRODUCTION\n'
        for elem in list(header[j].previous_siblings)[::-1]:
            fin += f'\n{purify(str(elem))}'
    except IndexError:
        pass
    if j >= len(header):
        # article has no section headers beyond the title
        return fin
    fin += f'\n\n{header[j].text.upper()}'
    for elem in header[j].next_siblings:
        if elem.name == 'h1':
            # a sibling <h1> starts a new section
            fin += f'\n\n{elem.text.upper()}'
            continue
        fin += f'\n{purify(str(elem))}'
    return fin
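# Rough shape of the assembled string (illustrative; assumes a page
# titled 'Some Title | Medium' with <h1> section headers):
#
#   url: https://medium.com/...
#
#   Title: SOME TITLE
#   Medium
#
#   INTRODUCTION
#   ...text before the first section header...
#
#   FIRST SECTION HEADER
#   ...section text...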
# function to save file in the current directory
def save_file(fin):
    if not os.path.exists('./scraped_articles'):
        os.mkdir('./scraped_articles')
    # build the file name from the article title
    fname = './scraped_articles/' + '_'.join(title.split()) + '.txt'
    with open(fname, 'w', encoding='utf8') as outfile:
        outfile.write(fin)
    print(f'File saved as {fname}')
# driver code
if __name__ == '__main__':
    fin = collect_text(get_page())
    save_file(fin)
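# Example session (illustrative; the URL is a placeholder):
#   $ python scraping_medium.py
#   Enter url of a medium article: https://medium.com/some-publication/some-post
#   File saved as ./scraped_articles/Some_Title.txt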