Source code for pylyrics2.clean_text

# Authors: Abhiket Gaurav, Artan Zandian, Macy Chan, Manju Abhinandana Kumar
# January 2022
import re
from xmlrpc.client import Boolean


def clean_text(text: str, bool_contra_dict: bool = True) -> str:
    """Normalize text for downstream processing.

    Processing order:

    1. URLs (``http://`` / ``https://``) are removed.
    2. If ``bool_contra_dict`` is true, common English contraction
       suffixes are expanded (e.g. ``"don't"`` -> ``"do not"``).
    3. Every character that is not an ASCII letter or whitespace is
       removed (digits, punctuation, ``#``, ...).
    4. The result is converted to lower case.

    Parameters
    ----------
    text : str
        Text to clean. Must contain at least one non-whitespace character.
    bool_contra_dict : bool, optional
        Whether to expand contraction suffixes before stripping
        punctuation. Defaults to True.

    Returns
    -------
    str
        Cleaned, lower-cased text.

    Raises
    ------
    TypeError
        If ``text`` is not a string or ``bool_contra_dict`` is not a bool.
    ValueError
        If ``text`` is empty or contains only whitespace.

    Examples
    --------
    >>> clean_text("Early optimization is the root of all evil!", False)
    'early optimization is the root of all evil'
    """
    try:
        if not isinstance(text, str):
            raise TypeError("Text should be a variable of type string.")
        if not text.strip():
            raise ValueError("Blank text input")
        if not isinstance(bool_contra_dict, bool):
            raise TypeError("bool_contra_dict should be a variable of type boolean.")
    except (TypeError, ValueError) as err:
        # Kept for backward compatibility: the message is echoed to stdout
        # before the exception propagates to the caller.
        print(err)
        raise

    # URLs must go first: once ':' and '/' are stripped by the character
    # filter below, the URL pattern can no longer match anything.
    text = re.sub(r"https?:\/\/.\S+", "", text)

    if bool_contra_dict:
        # Suffix-based expansion; must run before apostrophes are stripped.
        contra_dict = {
            "'s": " is",
            "n't": " not",
            "'m": " am",
            "'ll": " will",
            "'d": " would",
            "'ve": " have",
            "'re": " are",
        }
        for contraction, expansion in contra_dict.items():
            text = text.replace(contraction, expansion)

    # Drop everything except ASCII letters and whitespace. No flags are
    # passed: the character class already covers both cases, and passing
    # flags positionally would be interpreted as the `count` argument.
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    return text.lower()