# redact_urls.py
  1. # from urlextract import URLExtract
  2. import re
  3. import pandas as pd
  4. import sys
  5. sys.setrecursionlimit(1500) # after 1500 makes python kernal break
  6. def urls(df):
  7. pattern1 = r'^https?:\/\/.*[\r\n]*'
  8. pattern2 = r'(?:http://)?\w+\.\S*[^.\s]'
  9. df = df.apply(lambda text: re.sub(pattern1, 'xxx', text, flags=re.MULTILINE))
  10. df = df.apply(lambda text: re.sub(pattern2, 'xxx', text, flags=re.MULTILINE))
  11. return pd.Series(df)
############################################ Test
#################################################
# #
# Texts = [
#     'http://url.com', 'http://www.url.com/',
#     'https://url.com/bla3/blah3/', 'www.google.com'
# ]
# print(urls(pd.Series(Texts)))