elemeta.nlp.extractors.high_level package#

Submodules#

elemeta.nlp.extractors.high_level.acronym_count module#

class elemeta.nlp.extractors.high_level.acronym_count.AcronymCount(name: str | None = None)#

Bases: RegexMatchCount

Counts the number of acronyms in the text.

Parameters:

name (Optional[str], optional) – Name to use for the metadata. If not given, the name will be extracted from the class name.

Examples

>>> from elemeta.nlp.extractors.high_level.acronym_count import AcronymCount
>>> text = "W.T.F that was LOL"
>>> counter = AcronymCount()
>>> result = counter(text)
>>> print(result) # Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

Extract the count of matches for the given regex in the text.

elemeta.nlp.extractors.high_level.avg_word_length module#

class elemeta.nlp.extractors.high_level.avg_word_length.AvgWordLength(exclude_list: Set[str] = extended_punctuations, name: str | None = None)#

Bases: AvgTokenLength

Gives the average length of the words in the text.

Parameters:
  • exclude_list (Set[str], optional) – Set of words to exclude when computing the metric. Default is extended_punctuations.

  • name (str, optional) – Name of the metafeature. If not given, it will extract the name from the class name.

Example

>>> from elemeta.nlp.extractors.high_level.avg_word_length import AvgWordLength
>>> text = "Hello, my name is Inigo Montoya. You killed my father. Prepare to die."
>>> avg_word_length = AvgWordLength()
>>> result = avg_word_length(text)
>>> print(result)  # Output: 4.538

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of average token length in the text

elemeta.nlp.extractors.high_level.capital_letters_ratio module#

class elemeta.nlp.extractors.high_level.capital_letters_ratio.CapitalLettersRatio(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the ratio of capital letters to all letters

Parameters:

name (str, optional) – Name of the metafeature. If not given, it will be extracted from the class name.

name#

Name of the metafeature.

Type:

str

extract(text)#

Calculates the ratio of capital letters to all letters in the given text.

Examples

>>> from elemeta.nlp.extractors.high_level.capital_letters_ratio import CapitalLettersRatio
>>> extractor = CapitalLettersRatio()
>>> text = "HalF Ok"
>>> ratio = extractor.extract(text)
>>> print(ratio) #Output: 0.5

Methods

__call__(text)

run self.extract on the given text

extract(text)

Calculates the ratio of capital letters to all letters in the given text.

extract(text: str) float#

Calculates the ratio of capital letters to all letters in the given text.

Parameters:

text (str) – The text to check the ratio on.

Returns:

The ratio of capital letters to all letters.

Return type:

float

elemeta.nlp.extractors.high_level.date_count module#

class elemeta.nlp.extractors.high_level.date_count.DateCount(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the number of dates in the text.

Parameters:

name (Optional[str], optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

name#

Name of the metafeature.

Type:

str

extract(text)#

Return the number of dates in the text.

Examples

>>> date_counter = DateCount()
>>> text = "Entries are due by January 4th, 2017 at 8:00pm, created 01/15/2005 by ACME Inc. and associates."
>>> date_counter(text) #Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

Return the number of dates in the text.

extract(text: str) int#

Return the number of dates in the text.

Parameters:

text (str) – The string to run on.

Returns:

The number of dates in the text.

Return type:

int

elemeta.nlp.extractors.high_level.detect_language_langdetect module#

class elemeta.nlp.extractors.high_level.detect_language_langdetect.DetectLanguage(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Returns the language of the text.

Parameters:

name (str, optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

extract(text)#

Detects the language of the given text.

Examples

>>> detect_language = DetectLanguage()
>>> text = "I love cakes. Its the best. Almost like the rest"
>>> language = detect_language(text)
>>> print(language) #Output: 'en'

Methods

__call__(text)

run self.extract on the given text

extract(text)

Detects the language of the given text.

extract(text: str) str#

Detects the language of the given text.

Parameters:

text (str) – The text to detect the language on.

Returns:

The most likely language of the text.

Return type:

str

elemeta.nlp.extractors.high_level.email_count module#

class elemeta.nlp.extractors.high_level.email_count.EmailCount(name: str | None = None)#

Bases: RegexMatchCount

Counts the number of emails in the text.

Parameters:

name (Optional[str], optional) – Name to use for the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> email_counter = EmailCount(name="email_count")
>>> text = "lior.something@gmail.ac.il is ok but lior@superwise.il is better"
>>> count = email_counter(text)
>>> print(count) #Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(input)

Extract the count of matches for the given regex in the text.

extract(input: str) int#

Extract the count of matches for the given regex in the text.

Parameters:

input (str) – The text to run the regex on.

Returns:

The number of times the regex is found in the string.

Return type:

int

elemeta.nlp.extractors.high_level.embedding module#

class elemeta.nlp.extractors.high_level.embedding.Embedding(embedding_model: str | None = 'all-MiniLM-L6-v2', modules: Iterable[Module] | None = None, device: str | None = None, cache_folder: str | None = None, use_auth_token: bool | str | None = None, name: str | None = None)#

Bases: AbstractMetafeatureExtractor

Extracts embeddings from a text using a SentenceTransformer model.

Parameters:
  • embedding_model (Optional[str]) – The name of the SentenceTransformer model to use, by default “all-MiniLM-L6-v2”

  • modules (Optional[Iterable[nn.Module]]) – This parameter can be used to create custom SentenceTransformer models from scratch.

  • device (Optional[str]) – Device (like ‘cuda’ / ‘cpu’) that should be used for computation. If None, checks if a GPU can be used.

  • cache_folder (Optional[str]) – Path to store models

  • use_auth_token (Union[bool, str, None]) – HuggingFace authentication token to download private models.

  • name (Optional[str]) – Name of the extractor

Examples

>>> embed = Embedding(embedding_model="all-MiniLM-L6-v2")
>>> text = "NLP"
>>> embedding = embed(text)

Methods

__call__(input)

run self.extract on the given text

extract(input[, convert_to_tensor])

Extracts embeddings from a text using a SentenceTransformer model.

extract(input: str | List[str], convert_to_tensor: bool = True) List[Tensor] | ndarray | Tensor#

Extracts embeddings from a text using a SentenceTransformer model.

Parameters:
  • input (Union[str, List[str]]) – Text or list of texts to extract embeddings from.

  • convert_to_tensor (bool) – Whether to convert the output to a tensor or keep it as a numpy array.

Returns:

Embeddings of the input text(s).

Return type:

Union[List[Tensor], ndarray, Tensor]

Examples

>>> embedding = Embedding(embedding_model="all-MiniLM-L6-v2")
>>> text = "This is a sample sentence."
>>> embeddings = embedding.extract(text)
>>> print(embeddings)
[[-0.123, 0.456, ...]]

elemeta.nlp.extractors.high_level.emoji_count module#

class elemeta.nlp.extractors.high_level.emoji_count.EmojiCount(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the number of emojis in the text.

Parameters:

name (str, optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

name#

Name of the metafeature.

Type:

str

extract(text)#

Counts the number of emojis in the given text.

Examples

>>> emoji_counter = EmojiCount()
>>> text = "🤔 word 🙈 text 😌 ."
>>> num_emojis = emoji_counter(text)
>>> print(num_emojis) #Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

Counts the number of emojis in the given text.

extract(text: str) int#

Counts the number of emojis in the given text.

Parameters:

text (str) – The text to count emojis on.

Returns:

The number of emojis in the text.

Return type:

int

elemeta.nlp.extractors.high_level.hashtag_count module#

class elemeta.nlp.extractors.high_level.hashtag_count.HashtagCount(name: str | None = None)#

Bases: RegexMatchCount

Counts the number of hashtags in the text.

Parameters:

name (Optional[str], optional) – Name to use for the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> text = "I love #programming and #coding!"
>>> hashtag_counter = HashtagCount()
>>> count = hashtag_counter(text)
>>> print(count) #Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

Extract the count of matches for the given regex in the text.

elemeta.nlp.extractors.high_level.hinted_profanity_sentence_count module#

class elemeta.nlp.extractors.high_level.hinted_profanity_sentence_count.HintedProfanitySentenceCount(name: str | None = None)#

Bases: HintedProfanityTokensCount

Counts the number of sentences with profanity words in them uses better_profanity library. snguyenthanh/better_profanity

Parameters:

name (str, optional) – Name of the metadata. If not given, the name will be extracted from the class name.

Examples

>>> profanity_counter = HintedProfanitySentenceCount()
>>> text = "Fuck this sh!t. I want to fucking leave the country, but I am fine"
>>> profanity_count = profanity_counter(text)
>>> print(profanity_count) #Output: 1

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of profanity words in the text

elemeta.nlp.extractors.high_level.hinted_profanity_words_count module#

class elemeta.nlp.extractors.high_level.hinted_profanity_words_count.HintedProfanityWordsCount(name: str | None = None)#

Bases: HintedProfanityTokensCount

Counts the number of profanity words (uses the better_profanity library). snguyenthanh/better_profanity

Parameters:

name (str, optional) – The name of the metafeature. If not given, it will be extracted from the class name.

Examples

>>> from elemeta.nlp.extractors.high_level.hinted_profanity_words_count import HintedProfanityWordsCount
>>> profanity_word_counter = HintedProfanityWordsCount()
>>> text = "Fuck this sh!t. I want to fucking leave the country"
>>> count = profanity_word_counter(text)
>>> print(count) #Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of profanity words in the text

elemeta.nlp.extractors.high_level.mention_count module#

class elemeta.nlp.extractors.high_level.mention_count.MentionCount(name: str | None = None)#

Bases: RegexMatchCount

Counts the number of mentions (word in the format @someones_name)

Parameters:

name (Optional[str], optional) – Name to use for the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> from elemeta.nlp.extractors.high_level.mention_count import MentionCount
>>> mention_counter = MentionCount()
>>> count = mention_counter("Hello @JohnDoe, how are you?")
>>> print(count) #Output: 1

Methods

__call__(text)

run self.extract on the given text

extract(text)

Extract the count of matches for the given regex in the text.

elemeta.nlp.extractors.high_level.must_appear_words_percentage module#

class elemeta.nlp.extractors.high_level.must_appear_words_percentage.MustAppearWordsPercentage(must_appear: Set[str], name: str | None = None)#

Bases: MustAppearTokensPercentage

For a given set of words, return the percentage of words that appeared in the text

Parameters:
  • must_appear (set of str) – Set of words that must appear in the text.

  • name (str, optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> from elemeta.nlp.extractors.high_level.must_appear_words_percentage import MustAppearWordsPercentage
>>> text = "I am good now"
>>> calc_word_precentage = MustAppearWordsPercentage(must_appear={"I", "am"})
>>> percentage = calc_word_precentage(text)
>>> print(percentage) #Output: 1

Methods

__call__(text)

run self.extract on the given text

extract(text)

gives the percentage of the tokens in must_appear set that appeared in the text

elemeta.nlp.extractors.high_level.ner_identifier module#

class elemeta.nlp.extractors.high_level.ner_identifier.NER_Identifier(name: str | None = None, path: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Identifies any potential PII mentioned in a text.

Parameters:
  • name (str, optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

  • path (str, optional) – The path used for the model. If not given, defaults to: https://huggingface.co/dslim/bert-base-NER

model_path#

The path to the NER model.

Type:

str

extract(text)#

Detects NER from a text.

Examples

>>> from elemeta.nlp.extractors.high_level.ner_identifier import NER_Identifier
>>> ner_identifier = NER_Identifier()
>>> text = "John Doe works at ABC Corp in New York."
>>> result = ner_identifier.extract(text)
>>> print(result)
{
    'B-PER': ['John'],
    'I-PER': ['Do', '##e'],
    'B-ORG': ['ABC'],
    'I-ORG': ['Corp'],
    'B-LOC': ['New'],
    'I-LOC': ['York']
}

Methods

__call__(text)

run self.extract on the given text

extract(text)

Detects NER from a text.

extract(text: str) Dict[str, List[str]]#

Detects NER from a text.

Parameters:

text (str) – The string to run the NER on.

Returns:

A dictionary where the keys represent the NER tags and the values are lists of associated words. B-MIS Beginning of a miscellaneous entity right after another miscellaneous entity I-MIS Miscellaneous entity B-PER Beginning of a person’s name right after another person’s name I-PER Person’s name B-ORG Beginning of an organization right after another organization I-ORG organization B-LOC Beginning of a location right after another location I-LOC Location and the value are associated information extracted from the text

Return type:

Dict[str, List[str]]

elemeta.nlp.extractors.high_level.number_count module#

class elemeta.nlp.extractors.high_level.number_count.NumberCount(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the number of numbers in the text.

Parameters:

name (str, optional) – Name of the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> from elemeta.nlp.extractors.high_level.number_count import NumberCount
>>> number_counter = NumberCount()
>>> text = "There are 3 apples and 5 oranges."
>>> number_counter(text) #Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

Return the number of numbers in the text.

validator(token)

Number check validator.

extract(text: str) int#

Return the number of numbers in the text.

Parameters:

text (str) – The string to run on.

Returns:

The number of numbers in the text.

Return type:

int

validator(token: str) bool#

Number check validator. Checks if the token is a number.

Parameters:

token (str) – The token to check if it is a number.

Returns:

True if the token is a number.

Return type:

bool

elemeta.nlp.extractors.high_level.out_of_vocabulary_count module#

class elemeta.nlp.extractors.high_level.out_of_vocabulary_count.OutOfVocabularyCount(vocabulary: Set[str] | None = None, name: str | None = None)#

Bases: TokensCount

For a given vocabulary (the default is English vocabulary taken from nltk.corpus) return the number of words outside of the vocabulary

Example

>>> from elemeta.nlp.extractors.high_level.out_of_vocabulary_count import OutOfVocabularyCount
>>> text = "Rick said Wubba Lubba dub-dub"
>>> oov_counter = OutOfVocabularyCount()
>>> print(oov_counter(text)) #Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

extract(text: str) int#

counts the number tokens in the text

Parameters:

text (str) – the text to check appearance on

Returns:

the number of appearance of a must-appear word list

Return type:

int

elemeta.nlp.extractors.high_level.pii_identify module#

class elemeta.nlp.extractors.high_level.pii_identify.PII_Identify(name: str | None = None, pii: List[str] | None = None)#

Bases: AbstractTextMetafeatureExtractor

Identifies any potential Named Entity Recognitions (NERs) mentioned in a text.

Parameters:
  • name (str, optional) – Name of the metafeature. If not given, it will be extracted from the class name.

  • pii (list of str, optional) – List of specific Personally Identifiable Information (PII) to narrow down the analyzer. If none or an unsupported PII is given, the default is to search for all. Supported entities can be found at: https://microsoft.github.io/presidio/supported_entities/

Examples

>>> from elemeta.nlp.extractors.high_level.pii_identify import PII_Identify
>>> pii = PII_Identify()
>>> text = "My email address is john.doe@example.com and my phone number is 123-456-7890."
>>> result = pii(text)
>>> print(result)
{'EMAIL_ADDRESS': ['john.doe@example.com'], 'PHONE_NUMBER': ['123-456-7890'], 'URL': ['john.do', 'example.com']}

Methods

__call__(text)

run self.extract on the given text

extract(text)

Detects Named Entity Recognitions (NERs) from a text.

extract(text: str) Dict[str, List[str]]#

Detects Named Entity Recognitions (NERs) from a text.

Parameters:

text (str) – The string to run the analysis on.

Returns:

A dictionary of the identified PII from the text, where the keys are the type of PII and the values are lists of all analyzed PII of that type.

Return type:

Dict[str, List[str]]

Examples

>>> from elemeta.nlp.extractors.high_level.pii_identify import PII_Identify
>>> extractor = PII_Identify()
>>> text = "My email address is john.doe@example.com and my phone number is 123-456-7890."
>>> result = extractor.extract(text)
>>> print(result)
{'EMAIL_ADDRESS': ['john.doe@example.com'], 'PHONE_NUMBER': ['123-456-7890'], 'URL': ['john.do', 'example.com']}

elemeta.nlp.extractors.high_level.punctuation_count module#

class elemeta.nlp.extractors.high_level.punctuation_count.PunctuationCount(punctuations: Set[str] = extended_punctuations, name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the number of punctuation marks in the text

Example

>>> from elemeta.nlp.extractors.high_level.punctuation_count import PunctuationCount
>>> text = "Once I was afraid, I was petrified!"
>>> punctuation_count = PunctuationCount()
>>> result = punctuation_count(text)
>>> print(result)  # Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of punctuations in the text

extract(text: str) int#

return the number of punctuations in the text

Parameters:

text (str) – the string to run on

Returns:

the number of punctuations in the text

Return type:

int

elemeta.nlp.extractors.high_level.regex_match_count module#

class elemeta.nlp.extractors.high_level.regex_match_count.RegexMatchCount(regex: str = '.+', name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

For a given regex, return the number of matches it has in the text.

Parameters:
  • regex (str) – The regular expression pattern to match.

  • name (Optional[str], optional) – The name of the metafeature. If not given, the name will be extracted from the class name.

Examples

>>> digit_counter = RegexMatchCount(regex=r'\d', name='Digit Count')
>>> text = 'There are 3 apples and 52 oranges.'
>>> digit_counter(text) #Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

Extract the count of matches for the given regex in the text.

extract(text: str) int#

Extract the count of matches for the given regex in the text.

Parameters:

text (str) – The text to run the regex on.

Returns:

The number of times the regex is found in the string.

Return type:

int

elemeta.nlp.extractors.high_level.semantic_text_pair_similarity module#

class elemeta.nlp.extractors.high_level.semantic_text_pair_similarity.SemanticTextPairSimilarity(embedding_model: str | None = None, modules: Iterable[Module] | None = None, device: str | None = None, cache_folder: str | None = None, use_auth_token: bool | str | None = None, name: str | None = None)#

Bases: AbstractTextPairMetafeatureExtractor

Returns the similarity of two texts

Examples

>>> pair_similarity = SemanticTextPairSimilarity()
>>> print(pair_similarity("I love cats", "I love dogs")) #Output: 0.7720986008644104
>>> print(pair_similarity("Hi","Bye")) #Output: 0.36858582496643066

Methods

__call__(input_1, input_2)

run self.extract on the given text

extract(input_1, input_2)

Extracts the similarity between two texts

extract(input_1: str, input_2: str) float#

Extracts the similarity between two texts

Parameters:
  • input_1 (str) – first text

  • input_2 (str) – second text

Returns:

similarity between the two texts

Return type:

float

elemeta.nlp.extractors.high_level.sentence_avg_length module#

class elemeta.nlp.extractors.high_level.sentence_avg_length.SentenceAvgLength(name: str | None = None)#

Bases: AvgTokenLength

Gives the average length of sentences in the text

Example

>>> from elemeta.nlp.extractors.high_level.sentence_avg_length import SentenceAvgLength
>>> text = "Hello, my name is Inigo Montoya. You killed my father. Prepare to die."
>>> sentence_avg_length = SentenceAvgLength()
>>> result = sentence_avg_length(text)
>>> print(result)  # Output: 22.66668

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of average token length in the text

elemeta.nlp.extractors.high_level.sentence_count module#

class elemeta.nlp.extractors.high_level.sentence_count.SentenceCount(name: str | None = None)#

Bases: TokensCount

Counts the number of sentences in the text

Example

>>> from elemeta.nlp.extractors.high_level.sentence_count import SentenceCount
>>> text = "Hello, my name is Inigo Montoya. You killed my father. Prepare to die."
>>> sentence_count = SentenceCount()
>>> result = sentence_count(text)
>>> print(result)  # Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

elemeta.nlp.extractors.high_level.sentiment_polarity module#

class elemeta.nlp.extractors.high_level.sentiment_polarity.SentimentPolarity(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Returns the Sentiment Polarity (read more about the difference between sentiment polarity and sentiment subjectivity here: https://www.tasq.ai/tasq-question/what-are-polarity-and-subjectivity-in-sentiment-analysis/) value as a range between -1 to 1, where -1 means the text is an utterly negative sentiment and 1 is an utterly positive sentiment.

Example

>>> from elemeta.nlp.extractors.high_level.sentiment_polarity import SentimentPolarity
>>> sentiment_polarity = SentimentPolarity()
>>> print(sentiment_polarity("I love cake!")) #Output: 0.669
>>> print(sentiment_polarity("I HATE cake!")) #Output: -0.693

Methods

__call__(text)

run self.extract on the given text

extract(text)

sentiment analysis prediction function

extract(text: str) float#

sentiment analysis prediction function

Parameters:

text (str) – the text we want sentiment analysis to run on

Returns:

sentiment – between -1 and 1 representing the sentiment. -1 negative, 1 positive

Return type:

float

elemeta.nlp.extractors.high_level.sentiment_subjectivity module#

class elemeta.nlp.extractors.high_level.sentiment_subjectivity.SentimentSubjectivity(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Returns the Sentiment Subjectivity (read more about the difference between sentiment polarity and sentiment subjectivity here:https://www.tasq.ai/tasq-question/what-are-polarity-and-subjectivity-in-sentiment-analysis/) value as a range between 0 to 1, where 0.0 is utterly objective, and 1.0 is utterly subjective.

Example

>>> from elemeta.nlp.extractors.high_level.sentiment_subjectivity import SentimentSubjectivity
>>> sentiment_subjectivity = SentimentSubjectivity()
>>> print(sentiment_subjectivity("I hate cakes!")) #Output: 0.9
>>> print(sentiment_subjectivity("They all failed the test")) #Output: 0.3

Methods

__call__(text)

run self.extract on the given text

extract(text)

sentiment subjectivity prediction function

extract(text: str) float#

sentiment subjectivity prediction function

Parameters:

text (str) – the text we want sentiment subjectivity to run on

Returns:

  • sentiment (float) – return subjectivity score as a float within the range [0.0, 1.0]

  • where 0.0 is very objective and 1.0 is very subjective.

elemeta.nlp.extractors.high_level.special_chars_count module#

class elemeta.nlp.extractors.high_level.special_chars_count.SpecialCharsCount(specials: Set[str] = special_chars, name: str | None = None)#

Bases: TokensCount

Counts the number of special characters in the text.

Example

>>> from elemeta.nlp.extractors.high_level.special_chars_count import SpecialCharsCount
>>> text = "Once I was afraid, I was petrified!"
>>> special_chars_count = SpecialCharsCount()
>>> result = special_chars_count(text)
>>> print(result)  # Output: 1

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

elemeta.nlp.extractors.high_level.stop_words_count module#

class elemeta.nlp.extractors.high_level.stop_words_count.StopWordsCount(name: str | None = None)#

Bases: TokensCount

Counts the number of stop words.

Example

>>> from elemeta.nlp.extractors.high_level.stop_words_count import StopWordsCount
>>> text = "Once I was afraid, I was petrified"
>>> stop_words_count = StopWordsCount()
>>> result = stop_words_count(text)
>>> print(result)  # Output: 4

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

elemeta.nlp.extractors.high_level.syllable_count module#

class elemeta.nlp.extractors.high_level.syllable_count.SyllableCount(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Counts the total number of syllables in the text.

Example

>>> from elemeta.nlp.extractors.high_level.syllable_count import SyllableCount
>>> syllable_count = SyllableCount()
>>> print(syllable_count("hyperemotionality"))  # Output: 8

Methods

__call__(text)

run self.extract on the given text

extract(text)

This function will extract the metric from the text :param text: :type text: str

extract(text: str) int#

This function will extract the metric from the text :param text: :type text: str

Returns:

the metadata extracted from text

Return type:

Any

elemeta.nlp.extractors.high_level.text_complexity module#

class elemeta.nlp.extractors.high_level.text_complexity.TextComplexity(metric: Callable[[str], float] = textstat.textstat.flesch_reading_ease, name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Return the Flesch Reading Ease Score of the text

Example

>>> from elemeta.nlp.extractors.high_level.text_complexity import TextComplexity
>>> text_complexity = TextComplexity()
>>> print(text_complexity("This love cakes"))  # Output: 119.19
>>> print(text_complexity("Production of biodiesel by enzymatic transesterifcation of non-edible Salvadora persica (Pilu) oil and crude coconut oil in a solvent-free system"))  # Output: 17.34

Methods

__call__(text)

run self.extract on the given text

extract(text)

This function will extract the metric from the text :param text: :type text: str

extract(text: str) float#

This function will extract the metric from the text :param text: :type text: str

Returns:

the metadata extracted from text

Return type:

Any

elemeta.nlp.extractors.high_level.text_length module#

class elemeta.nlp.extractors.high_level.text_length.TextLength(name: str | None = None)#

Bases: AbstractTextMetafeatureExtractor

Gives the number of characters in the text (including whitespace).

Methods

__call__(text)

run self.extract on the given text

extract(text)

text length counter returns the length of the text

extract(text: str) int#

text length counter returns the length of the text

Parameters:

text (str) – the text to check length on

Returns:

the length of the text

Return type:

int

elemeta.nlp.extractors.high_level.toxicity_extractor module#

class elemeta.nlp.extractors.high_level.toxicity_extractor.ToxicityExtractor(name: str | None = None, tokenizer: Callable = DEFAULT_TOKENIZER, aggregate: Callable = min)#

Bases: AbstractTextMetafeatureExtractor

measures toxicity of a given text.

Example

>>> from elemeta.nlp.extractors.high_level.toxicity_extractor import ToxicityExtractor
>>> text = "Once I was afraid, I was petrified"
>>> toxicity_extractor = ToxicityExtractor()
>>> result = toxicity_extractor(text)
>>> print(result)  # Output: 0.000

Methods

__call__(text)

run self.extract on the given text

extract(text)

returns a float representing how toxic a piece of text is

extract(text: str) float#

returns a float representing how toxic a piece of text is

Parameters:

text (str) – the string to run on

Returns:

a float closer to one is more toxic, closer to zero is non toxic.

Return type:

float

elemeta.nlp.extractors.high_level.unique_word_count module#

class elemeta.nlp.extractors.high_level.unique_word_count.UniqueWordCount(exceptions: Set[str] = english_punctuations, name: str | None = None)#

Bases: UniqueTokenCount

Currently returns the number of words in the text that appear exactly once, will change to count the unique words in the text

Example

>>> from elemeta.nlp.extractors.high_level.unique_word_count import UniqueWordCount
>>> text = "Once I was afraid, I was petrified"
>>> unique_word_count = UniqueWordCount()
>>> result = unique_word_count(text)
>>> print(result)  # Output: 3

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

elemeta.nlp.extractors.high_level.unique_word_ratio module#

class elemeta.nlp.extractors.high_level.unique_word_ratio.UniqueWordRatio(exceptions: Set[str] = english_punctuations, name: str | None = None)#

Bases: UniqueTokensRatio

Gives the ratio between the number of distinct words (total number of different values regardless how many times it appears in the dataset) to the number of unique words (total number of values that only appear once in the dataset).

Example

>>> from elemeta.nlp.extractors.high_level.unique_word_ratio import UniqueWordRatio
>>> text = "I love to move it move it"
>>> unique_word_ratio = UniqueWordRatio()
>>> result = unique_word_ratio(text)
>>> print(result)  # Output: 0.6

Methods

__call__(text)

run self.extract on the given text

extract(text)

Unique words in text function

elemeta.nlp.extractors.high_level.word_count module#

class elemeta.nlp.extractors.high_level.word_count.WordCount(exclude_tokens_list: Set[str] = extended_punctuations, name: str | None = None)#

Bases: TokensCount

Gives the number of words in the text.

Example

>>> from elemeta.nlp.extractors.high_level.word_count import WordCount
>>> text = "I love to move it move it"
>>> word_count = WordCount()
>>> result = word_count(text)
>>> print(result)  # Output: 7

Methods

__call__(text)

run self.extract on the given text

extract(text)

counts the number tokens in the text

elemeta.nlp.extractors.high_level.word_regex_matches_count module#

class elemeta.nlp.extractors.high_level.word_regex_matches_count.WordRegexMatchesCount(regex: str = '.*', name: str | None = None)#

Bases: TokenRegexMatchesCount

For a given regex return the number of words matching the regex

Example

>>> from elemeta.nlp.extractors.high_level.word_regex_matches_count import WordRegexMatchesCount
>>> text = "he hee is"
>>> regex = "h.+"
>>> word_regex_matches_counter = WordRegexMatchesCount(regex=regex)
>>> result = word_regex_matches_counter(text)
>>> print(result)  # Output: 2

Methods

__call__(text)

run self.extract on the given text

extract(text)

return the number of matches of the given regex in the text

validator(token)

regex check validator checks if the token abides by the regex

Module contents#