Topic Identifier API for Python#
- class eot.wowool.topic_identifier.TopicIdentifier#
The model contains information about topic candidates and the number of documents in the collection in which they appear.
- __init__(language: str, count: int = 5, threshold: int = 0, topic_model: str = '', domains: Optional[List[Union[str, eot.wowool.native.core.domain.Domain]]] = None, ignore_entities: bool = False, engine: Optional[eot.wowool.native.core.engine.Engine] = None)#
- Parameters
language (str) – Language to process the input document.
count (int) – The number of topics to be returned. default = 5
threshold (int) – The lower threshold as a percentage [0-100]. default = 0
topic_model (str) – The reference file created with create_topic_model.
domains (list[Union[str, Domain]], optional) – List of domains you want to process before generating topics.
engine (eot.wowool.native.core.Engine) – The engine that will cache the domains and models.
from eot.wowool.native.core import Language
from eot.wowool.topic_identifier.topic_identifier import TopicIdentifier
from eot.test.corpus import Corpus
from eot.wowool.document import Document

english = Language("english")
topic_it = TopicIdentifier("english", count=5)
# add the movie folder, containing all the movie files.
corpus = Corpus("english/movies")
print("Adding the corpus files to the model for better results.")
# Note: this is not strictly necessary.
for ip in corpus:
    print(f"adding: {ip.id}")
    topic_it.add(english(Document(ip)))

print("Topics/file")
# display the results of every file, by iterating over every file.
for ip in corpus:
    doc = english(Document(ip))
    doc = topic_it(doc)
    topics = doc.results('eot_topics')
    print(f"# {doc.id}")
    for topic in topics:
        print(f" - {topic}")
- __call__(document: eot.wowool.document.document.Document, model: Optional[eot.wowool.topic_identifier.topic_identifier.Model] = None) -> eot.wowool.document.document.Document#
Add topics to a given Document object
- Parameters
document (eot.wowool.Document) – The Document object we want to add the topics to.
from eot.wowool.native.core import Language
from eot.wowool.topic_identifier import TopicIdentifier

english = Language("english")
number_of_topics = 5
topic_it = TopicIdentifier(language="english", count=number_of_topics)
# display the topics identified in the given text.
document = topic_it(english("This is the effect of the green house gases"))
for topic in document.results('eot_topics'):
    print(f" - {topic}")