# redact_urls_1.py

  1. def urls(df):
  2. pattern1 = r'^https?:\/\/.*[\r\n]*'
  3. pattern2 = r'(?:http://)?\w+\.\S*[^.\s]'
  4. df = df.apply(lambda text: re.sub(pattern1, 'xxx', text, flags=re.MULTILINE))
  5. df = df.apply(lambda text: re.sub(pattern2, 'xxx', text, flags=re.MULTILINE))
  6. return pd.Series(df)
############################################ Test #################################################
#
# Texts = [
#     'http://url.com', 'http://www.url.com/',
#     'https://url.com/bla3/blah3/', 'www.google.com'
# ]
# print(urls(pd.Series(Texts)))