# main.py
# A demonstration of fast keyword searching using the FlashText library.
#
# Before running, you will need to install flashtext:
# pip install flashtext
from flashtext import KeywordProcessor
print("--- Starting Fast Keyword Matching Demonstration with FlashText ---")

# --- Section 1: Define Keywords and Text ---
# Each key is the canonical ("clean") name that gets reported for a match;
# its list holds the surface forms that should be recognised as that name.
keyword_dict = {
    "data science": ["Data Science", "data scientist"],
    "machine learning": ["Machine Learning", "ML"],
    "artificial intelligence": ["Artificial Intelligence", "AI", "A.I."],
    "new york": ["New York City", "NYC"],
}

# Sample passage to scan. The leading and trailing newlines are part of the
# value, so any character offsets computed later are relative to them too.
text_to_search = (
    "\n"
    "Dr. Alice Smith, a brilliant data scientist from Google, is presenting her\n"
    "groundbreaking research on Artificial Intelligence in New York City today.\n"
    "Her work in ML has been pivotal.\n"
)

# Echo the inputs so the results printed afterwards can be checked by eye.
print("\n--- 1. Text to be Searched ---", text_to_search, sep="\n")
print("\n--- Keywords to Find ---", keyword_dict, sep="\n")
# --- Section 2: Initialize Keyword Processor and Add Keywords ---
# Build the matcher and register every variation from keyword_dict under its
# clean name (dictionary key -> clean name, list entries -> variations).
print("\n--- 2. Initializing FlashText Keyword Processor ---")
# case_sensitive=False is the library default, spelled out here so it is
# obvious that e.g. "data scientist" and "Data Scientist" are treated alike.
keyword_processor = KeywordProcessor(case_sensitive=False)
keyword_processor.add_keywords_from_dict(keyword_dict)
print("Keyword processor is ready.")
# --- Section 3: Extract Keywords from Text ---
# Scan the sample text once; the result holds the clean name registered for
# each match rather than the literal substring that appeared in the text.
print("\n--- 3. Extracting Keywords ---")
found_keywords = keyword_processor.extract_keywords(text_to_search)
# Heading and result are emitted on consecutive lines by a single print call.
print("\nFound keywords (clean names):", found_keywords, sep="\n")
# --- Section 4: Extract Keywords with Span Information ---
# With span_info=True every match is unpacked below as (clean name, start, end),
# where start/end are offsets into text_to_search. Slicing the text with them
# recovers the wording as it was actually written.
print("\n--- 4. Extracting Keywords with Span Information ---")
found_keywords_with_span = keyword_processor.extract_keywords(
    text_to_search, span_info=True
)
print("\nFound keywords with their start and end positions:")
for keyword, start, end in found_keywords_with_span:
    # The loop body must be indented under the `for`; left at column 0 the
    # file does not compile (IndentationError) and nothing in it runs.
    print(
        f"- Found '{keyword}' (Original text: '{text_to_search[start:end]}') "
        f"at index {start}:{end}"
    )
print("\n--- End of Demonstration ---")