Named Entity Recognition & Text Parsing
Raw text is unstructured — it's just a sequence of characters. But buried inside that text is structured information: names of people, organizations, dates, monetary amounts, and more. Named Entity Recognition (NER) is the task of automatically identifying and classifying these entities.
Why NER Matters
NER is a critical building block for downstream tasks such as information extraction, question answering, search and recommendation, knowledge-graph construction, and de-identification of sensitive records.
Standard Entity Types
| Entity Type | Code | Examples |
|---|---|---|
| Person | PERSON | "Albert Einstein", "Marie Curie" |
| Organization | ORG | "Google", "United Nations" |
| Location | GPE / LOC | "Paris", "Mount Everest" |
| Date | DATE | "March 15, 2024", "last Tuesday" |
| Money | MONEY | "$1.5 billion", "500 euros" |
| Product | PRODUCT | "iPhone 15", "Tesla Model 3" |
The NLP Pipeline
spaCy: Industrial-Strength NLP
spaCy is the most popular library for production NLP pipelines. It's fast, accurate, and provides a unified API for tokenization, POS tagging, NER, and dependency parsing.
import spacy

# Pre-trained English pipeline.
# Install it first with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California on April 1, 1976."

# Calling the pipeline on raw text returns a Doc that is already
# tokenized, POS-tagged, dependency-parsed, and entity-labelled.
doc = nlp(text)

# Per-token annotations from the tokenizer/tagger/parser
print("Tokens:")
for tok in doc:
    print(f" {tok.text:15s} POS: {tok.pos_:6s} Dep: {tok.dep_:10s}")

# Named entities recognized in the text, with character offsets
print("\nEntities:")
for entity in doc.ents:
    print(f" {entity.text:25s} Label: {entity.label_:10s} Span: [{entity.start_char}:{entity.end_char}]")

# Output:
# Apple Inc.    Label: ORG      Span: [0:10]
# Steve Jobs    Label: PERSON   Span: [26:36]
# Cupertino     Label: GPE      Span: [40:49]
# California    Label: GPE      Span: [51:61]
# April 1, 1976 Label: DATE     Span: [65:78]

Part-of-Speech (POS) Tagging
POS tagging assigns a grammatical category to each token. This is essential for understanding sentence structure:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")

# Show each token with its coarse universal tag (pos_), its fine-grained
# Penn Treebank tag (tag_), and spaCy's human-readable explanation.
print(f"{'Token':<10} {'POS':<8} {'Fine POS':<8} {'Explanation'}")
print("-" * 50)
for tok in doc:
    print(f"{tok.text:<10} {tok.pos_:<8} {tok.tag_:<8} {spacy.explain(tok.tag_)}")

# Output:
# Token      POS      Fine POS Explanation
# --------------------------------------------------
# The        DET      DT       determiner
# quick      ADJ      JJ       adjective
# brown      ADJ      JJ       adjective
# fox        NOUN     NN       noun, singular or mass
# jumps      VERB     VBZ      verb, 3rd person singular present
# over       ADP      IN       conjunction, subordinating or preposition
# the        DET      DT       determiner
# lazy       ADJ      JJ       adjective
# dog        NOUN     NN       noun, singular or mass

Dependency Parsing
Dependency parsing identifies grammatical relationships between words — which word is the subject, which is the object, which modifies which:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The CEO of Google announced a new product yesterday")

# One arc of the dependency tree per token: token --relation--> head
for tok in doc:
    print(f"{tok.text:12s} --{tok.dep_:10s}--> {tok.head.text}")

# Output:
# The       --det      --> CEO
# CEO       --nsubj    --> announced   (subject of "announced")
# of        --prep     --> CEO
# Google    --pobj     --> of          (object of "of")
# announced --ROOT     --> announced   (root of the sentence)
# a         --det      --> product
# new       --amod     --> product     (adjective modifier)
# product   --dobj     --> announced   (direct object of "announced")
# yesterday --npadvmod --> announced   (temporal modifier)

# Extract subject-verb-object triples hanging off the root verb
for tok in doc:
    if tok.dep_ != "ROOT":
        continue
    subj = [kid for kid in tok.children if "subj" in kid.dep_]
    obj = [kid for kid in tok.children if "obj" in kid.dep_]
    if subj and obj:
        print(f"\nTriple: {subj[0].text} → {tok.text} → {obj[0].text}")
# Triple: CEO → announced → product

Regex Patterns for Structured Extraction
Sometimes you need to extract well-defined patterns that don't require a neural model — emails, phone numbers, URLs, dates in specific formats. Regular expressions are the right tool here:
import re

text = """
Contact John Smith at john.smith@example.com or call (555) 123-4567.
The invoice total is $1,234.56 due on 2024-03-15.
Visit https://www.example.com for more info.
"""

# One named regex per pattern family we want to pull out of free text.
patterns = {
    "email": r'[\w.+-]+@[\w-]+\.[\w.-]+',
    "phone": r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}',
    "money": r'\$[\d,]+\.?\d*',
    "date": r'\d{4}-\d{2}-\d{2}',
    "url": r'https?://[\w./\-]+',
}

# Run each pattern over the text and report every match it finds.
for name, pattern in patterns.items():
    print(f"{name:8s}: {re.findall(pattern, text)}")

# email   : ['john.smith@example.com']
# phone   : ['(555) 123-4567']
# money   : ['$1,234.56']
# date    : ['2024-03-15']
# url     : ['https://www.example.com']

Regex vs. NER: When to Use Which
Custom NER Training with spaCy
Pre-trained models know about standard entity types, but your domain might have custom entities — drug names, legal clauses, product SKUs. You can train spaCy's NER on your own data:
1import spacy
2from spacy.tokens import DocBin
3from spacy.training import Example
4import random
5
6# --- Prepare training data ---
7# Format: (text, {"entities": [(start, end, label), ...]})
8TRAIN_DATA = [
9 ("Aspirin is used for headaches", {"entities": [(0, 7, "DRUG")]}),
10 ("Ibuprofen reduces inflammation", {"entities": [(0, 9, "DRUG")]}),
11 ("The patient was prescribed Metformin for diabetes",
12 {"entities": [(26, 35, "DRUG")]}),
13 ("Amoxicillin treats bacterial infections",
14 {"entities": [(0, 11, "DRUG")]}),
15 ("She takes Lisinopril for blood pressure",
16 {"entities": [(10, 20, "DRUG")]}),
17]
18
19# --- Create a blank model or start from existing ---
20nlp = spacy.blank("en") # Start fresh, or use spacy.load("en_core_web_sm")
21
22# Add NER pipe if not present
23if "ner" not in nlp.pipe_names:
24 ner = nlp.add_pipe("ner", last=True)
25else:
26 ner = nlp.get_pipe("ner")
27
28# Add custom entity label
29ner.add_label("DRUG")
30
31# --- Training loop ---
32optimizer = nlp.begin_training()
33
34for epoch in range(30):
35 random.shuffle(TRAIN_DATA)
36 losses = {}
37
38 for text, annotations in TRAIN_DATA:
39 doc = nlp.make_doc(text)
40 example = Example.from_dict(doc, annotations)
41 nlp.update([example], sgd=optimizer, losses=losses)
42
43 if epoch % 10 == 0:
44 print(f"Epoch {epoch:3d} | Loss: {losses['ner']:.4f}")
45
46# --- Test the trained model ---
47test_doc = nlp("The doctor recommended Aspirin and Ibuprofen for the pain")
48for ent in test_doc.ents:
49 print(f" {ent.text:15s} Label: {ent.label_}")Building a Complete Text Processing Pipeline
In practice, you'll combine multiple techniques into a pipeline:
1import spacy
2import re
3from collections import defaultdict
4
def process_document(text, nlp_model):
    """Run a combined extraction pass over *text*.

    Combines three techniques:
      1. spaCy NER (via *nlp_model*) for contextual entities
      2. regex for rigidly structured patterns (emails, ISO dates, money)
      3. spaCy noun chunks for key phrases

    Returns a dict with keys "entities" (label -> list of
    {"text", "start", "end"} dicts), "patterns" (name -> list of raw
    matches), and "key_phrases" (list of noun-chunk strings).
    """
    doc = nlp_model(text)

    # 1. Contextual entities, grouped by label
    entities = defaultdict(list)
    for span in doc.ents:
        entities[span.label_].append(
            {"text": span.text, "start": span.start_char, "end": span.end_char}
        )

    # 2. Structured patterns that don't need a model
    structured = {
        "emails": re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', text),
        "dates": re.findall(r'\d{4}-\d{2}-\d{2}', text),
        "money": re.findall(r'\$[\d,]+\.?\d*', text),
    }

    # 3. Key noun phrases
    return {
        "entities": entities,
        "patterns": structured,
        "key_phrases": [chunk.text for chunk in doc.noun_chunks],
    }
38
# --- Use it ---
nlp = spacy.load("en_core_web_sm")
text = """
On 2024-03-15, Google CEO Sundar Pichai announced a $2 billion investment
in cloud infrastructure in London. Contact press@google.com for details.
"""

results = process_document(text, nlp)

# Entities grouped by label
print("=== Entities ===")
for label, found in results["entities"].items():
    print(f" {label}: {[e['text'] for e in found]}")

# Regex-extracted structured patterns
print("\n=== Patterns ===")
for name, hits in results["patterns"].items():
    print(f" {name}: {hits}")

# Noun-chunk key phrases
print("\n=== Key Phrases ===")
for phrase in results["key_phrases"]:
    print(f" - {phrase}")