# nlp_utils.py
  1. from nltk.corpus import stopwords
  2. from nltk.tokenize import word_tokenize, sent_tokenize
  3. from crest.helper.utils import read_file
  4. stop_words = set(stopwords.words('english'))
  5. import string
  6. def is_whitespace(c, use_space=True):
  7. if (c == " " and use_space) or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
  8. return True
  9. return False
  10. def compact_text(paragraph_text):
  11. doc_tokens = []
  12. prev_is_whitespace = True
  13. for c in paragraph_text:
  14. if is_whitespace(c):
  15. prev_is_whitespace = True
  16. else:
  17. if prev_is_whitespace:
  18. doc_tokens.append(' ')
  19. doc_tokens.append(c)
  20. prev_is_whitespace = False
  21. else:
  22. doc_tokens.append(c)
  23. prev_is_whitespace = False
  24. return ''.join(doc_tokens)
  25. # removes punctutions and stop words from the text
  26. def normalize_text(text):
  27. state = text.lower()
  28. out = state.translate(str.maketrans('', '', string.punctuation))
  29. out = word_tokenize(out)
  30. s_ws = [w for w in out if not w in stop_words]
  31. return s_ws