# main.py
# An alternative demonstration of NLP using the spaCy library.
# This program performs the same tasks as the NLTK version.
#
# Before running, you will need to install spaCy and its English model:
# 1. pip install spacy
# 2. python -m spacy download en_core_web_sm
import spacy
from collections import Counter
print("--- Starting Natural Language Processing Demonstration with spaCy ---")
# --- Section 1: Load the spaCy Model ---
# We load the small English model. This object contains all the
# processing pipeline components (tokenizer, tagger, parser, NER, etc.).
try:
    nlp = spacy.load("en_core_web_sm")
    print("\n[+] spaCy English model 'en_core_web_sm' loaded successfully.")
except OSError:
    print("\n[-] spaCy model not found. Please run:")
    print("    python -m spacy download en_core_web_sm")
    exit()
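# (Optional) As a quick sanity check, the loaded pipeline's components can be
# listed via nlp.pipe_names; for en_core_web_sm this typically includes the
# tagger, parser, and NER components mentioned above.
print(f"[i] Pipeline components: {nlp.pipe_names}")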
# --- Section 2: Sample Text ---
text = "Dr. Alice Smith, a brilliant scientist from Google, is presenting her groundbreaking research on AI in New York City today."
print(f"\n--- 1. Original Text ---\n'{text}'")
# Process the text with the spaCy pipeline
doc = nlp(text)
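# The resulting Doc behaves like a sequence of Token objects, and because the
# small model includes a dependency parser, sentence boundaries are also
# available via doc.sents. A quick illustration:
print(f"\nThe document contains {len(doc)} tokens and "
      f"{len(list(doc.sents))} sentence(s).")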
# --- Section 3: Basic Preprocessing ---
# spaCy performs tokenization, lemmatization, and stop-word detection in a single pipeline pass.
print("\n--- 2. Basic Preprocessing ---")
# We can create a list of cleaned, lemmatized tokens.
# A token's lemma is its base form. is_stop checks if it's a stop word.
lemmatized_tokens = [
    token.lemma_ for token in doc if not token.is_stop and not token.is_punct
]
print("\nFinal list of preprocessed (lemmatized) words:")
print(lemmatized_tokens)
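# For illustration, here is how each token's surface form maps to its lemma,
# along with the flags used in the filter above (limited to the first 8 tokens
# to keep the output short).
print("\nToken -> lemma (is_stop, is_punct) for the first 8 tokens:")
for token in doc[:8]:
    print(f"- '{token.text}' -> '{token.lemma_}' "
          f"(stop={token.is_stop}, punct={token.is_punct})")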
# --- Section 4: Part-of-Speech (POS) Tagging ---
# Accessing the .pos_ attribute of each token gives its simple part-of-speech tag.
print("\n--- 3. Part-of-Speech (POS) Tagging ---")
print("POS tags for each word:")
for token in doc:
    print(f"- '{token.text}': {token.pos_}")
# --- Section 5: Named Entity Recognition (NER) ---
# spaCy's 'doc' object has an 'ents' property containing the named entities.
print("\n--- 4. Named Entity Recognition (NER) ---")
print("Named Entities found in the text:")
if doc.ents:
    for ent in doc.ents:
        # .label_ gives the entity type (e.g., PERSON, ORG, or GPE for
        # geopolitical entities such as countries and cities)
        print(f"- Entity: '{ent.text}', Type: {ent.label_}")
else:
    print("No named entities found.")
# --- Section 6: Frequency Distribution ---
# We can use Python's built-in Counter to get word frequencies.
print("\n--- 5. Word Frequency Distribution ---")
# We use the cleaned, lemmatized tokens for a more accurate count.
word_freq = Counter(lemmatized_tokens)
print("Most common words in the text:")
print(word_freq.most_common(5))
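# Note that lemmas of proper nouns keep their capitalization, so 'Alice' and
# 'alice' would be counted separately. A lowercased variant (an alternative,
# not part of the pipeline above) would look like this:
lowercase_freq = Counter(
    token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct
)
print("Most common words (case-insensitive):")
print(lowercase_freq.most_common(5))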
print("\n--- End of Demonstration ---")