# main.py
# An alternative demonstration of NLP using the spaCy library.
# This program performs the same tasks as the NLTK version.
#
# Before running, you will need to install spaCy and its English model:
# 1. pip install spacy
# 2. python -m spacy download en_core_web_sm
import spacy
from collections import Counter
print("--- Starting Natural Language Processing Demonstration with spaCy ---")
# --- Section 1: Load the spaCy Model ---
# We load the small English model. This object contains all the
# processing pipeline components (tokenizer, tagger, parser, NER, etc.).
try:
    nlp = spacy.load("en_core_web_sm")
    print("\n[+] spaCy English model 'en_core_web_sm' loaded successfully.")
except OSError:
    print("\n[-] spaCy model not found. Please run:")
    print("    python -m spacy download en_core_web_sm")
    exit()
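# (Optional) As a quick sanity check, the loaded pipeline's components can be
# listed via nlp.pipe_names; for en_core_web_sm this typically includes the
# tagger, parser, and NER components mentioned above.
print(f"[i] Pipeline components: {nlp.pipe_names}")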
# --- Section 2: Sample Text ---
text = "Dr. Alice Smith, a brilliant scientist from Google, is presenting her groundbreaking research on AI in New York City today."
print(f"\n--- 1. Original Text ---\n'{text}'")
# Process the text with the spaCy pipeline
doc = nlp(text)
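# The resulting Doc behaves like a sequence of Token objects, and because the
# small model includes a dependency parser, sentence boundaries are also
# available via doc.sents. A quick illustration:
print(f"\nThe document contains {len(doc)} tokens and "
      f"{len(list(doc.sents))} sentence(s).")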
# --- Section 3: Basic Preprocessing ---
# spaCy performs tokenization, lemmatization, and stop-word detection in a single pipeline pass.
print("\n--- 2. Basic Preprocessing ---")
# We can create a list of cleaned, lemmatized tokens.
# A token's lemma is its base form. is_stop checks if it's a stop word.
lemmatized_tokens = [
    token.lemma_ for token in doc if not token.is_stop and not token.is_punct
]
print("\nFinal list of preprocessed (lemmatized) words:")
print(lemmatized_tokens)
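# For illustration, here is how each token's surface form maps to its lemma,
# along with the flags used in the filter above (limited to the first 8 tokens
# to keep the output short).
print("\nToken -> lemma (is_stop, is_punct) for the first 8 tokens:")
for token in doc[:8]:
    print(f"- '{token.text}' -> '{token.lemma_}' "
          f"(stop={token.is_stop}, punct={token.is_punct})")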
# --- Section 4: Part-of-Speech (POS) Tagging ---
# Accessing the .pos_ attribute of each token gives its simple part-of-speech tag.
print("\n--- 3. Part-of-Speech (POS) Tagging ---")
print("POS tags for each word:")
for token in doc:
    print(f"- '{token.text}': {token.pos_}")
# --- Section 5: Named Entity Recognition (NER) ---
# spaCy's 'doc' object has an 'ents' property containing the named entities.
print("\n--- 4. Named Entity Recognition (NER) ---")
print("Named Entities found in the text:")
if doc.ents:
    for ent in doc.ents:
        # .label_ gives the entity type (e.g., PERSON, ORG, or GPE for
        # geopolitical entities such as countries and cities)
        print(f"- Entity: '{ent.text}', Type: {ent.label_}")
else:
    print("No named entities found.")
# --- Section 6: Frequency Distribution ---
# We can use Python's built-in Counter to get word frequencies.
print("\n--- 5. Word Frequency Distribution ---")
# We use the cleaned, lemmatized tokens for a more accurate count.
word_freq = Counter(lemmatized_tokens)
print("Most common words in the text:")
print(word_freq.most_common(5))
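# Note that lemmas of proper nouns keep their capitalization, so 'Alice' and
# 'alice' would be counted separately. A lowercased variant (an alternative,
# not part of the pipeline above) would look like this:
lowercase_freq = Counter(
    token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct
)
print("Most common words (case-insensitive):")
print(lowercase_freq.most_common(5))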
print("\n--- End of Demonstration ---")