Custom Extractor#

It is possible to create your own MetadataExtractor to fit you needs.

You can do this by implementing the abstract class AbstractMetadataExtractor

>>> from elemeta.nlp.extractors.low_level.abstract_text_metafeature_extractor import AbstractTextMetafeatureExtractor

Let’s create IsPalindromeExtractor that will return if the given text is palindrome:

Let’s create IsPalindromeExtractor that will return if the given text is palindrome:

>>> class IsPalindromeExtractor(AbstractTextMetafeatureExtractor):
...     def extract(self, text: str) -> bool:
...         normalized_text = text.replace(" ", "").lower()
...         return normalized_text == normalized_text[::-1]
>>> ipe = IsPalindromeExtractor()

Let’s test it:

>>> ipe("cat")
False
>>> ipe("taco cat")
True

Now you can easily use it and add it your MetadataExtractorRunner

>>> from elemeta.nlp.runners.metafeature_extractors_runner import MetafeatureExtractorsRunner
>>> metafeature_extractors_runner = MetafeatureExtractorsRunner()
>>> metafeature_extractors_runner.add_metafeature_extractor(ipe)
>>> metafeature_extractors_runner.run("Never odd or even")
{'detect_language': 'en',
 'emoji_count': 0,
 'text_complexity': 92.8,
 'unique_word_ratio': 1.0,
 'unique_word_count': 4,
 'word_regex_matches_count': 4,
 'number_count': 0,
 'out_of_vocabulary_count': 1,
 'must_appear_words_ratio': 0,
 'sentence_count': 1,
 'sentence_avg_length': 17.0,
 'word_count': 4,
 'avg_word_length': 3.5,
 'text_length': 17,
 'stop_words_count': 1,
 'punctuation_count': 0,
 'special_chars_count': 0,
 'capital_letters_ratio': 0.07142857142857142,
 'regex_match_count': 1,
 'email_count': 0,
 'link_count': 0,
 'hashtag_count': 0,
 'mention_count': 0,
 'syllable_count': 5,
 'acronym_count': 0,
 'date_count': 0,
 'is_palindrome_extractor': True}

>>> metafeature_extractors_runner.run("I love cats")
{'detect_language': 'ca',
 'emoji_count': 0,
 'text_complexity': 119.19,
 'unique_word_ratio': 1.0,
 'unique_word_count': 3,
 'word_regex_matches_count': 3,
 'number_count': 0,
 'out_of_vocabulary_count': 1,
 'must_appear_words_ratio': 0,
 'sentence_count': 1,
 'sentence_avg_length': 11.0,
 'word_count': 3,
 'avg_word_length': 3.0,
 'text_length': 11,
 'stop_words_count': 0,
 'punctuation_count': 0,
 'special_chars_count': 0,
 'capital_letters_ratio': 0.1111111111111111,
 'regex_match_count': 1,
 'email_count': 0,
 'link_count': 0,
 'hashtag_count': 0,
 'mention_count': 0,
 'syllable_count': 3,
 'acronym_count': 1,
 'date_count': 0,
 'is_palindrome_extractor': False}

For a full working example please use the following Google Colab