#!/usr/bin/env python3
"""
Lottery Ticket OCR Script
Extracts lottery numbers from a multi-page PDF of scanned lottery tickets.
Outputs: CSV with columns: page, row, num1, num2, num3, num4, num5, pb_num
"""
import csv
import re
import sys
import argparse
from pathlib import Path
# Third-party dependencies. If any of them is missing, install all three with
# pip (using the interpreter that is running this script) and import again.
try:
    import pdf2image
    from PIL import Image
    import pytesseract
except ImportError:
    print("Missing required packages. Installing...")
    import subprocess

    # Run pip as "python -m pip" so the packages land in this interpreter's
    # environment rather than whichever "pip" happens to be first on PATH.
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command += ["pdf2image", "pytesseract", "Pillow"]
    pip_command.append("--break-system-packages")
    subprocess.check_call(pip_command)

    # Retry the imports now that the install has succeeded.
    import pdf2image
    from PIL import Image
    import pytesseract
# ============================================================================
# CROP SETTINGS - ADJUST THESE VALUES
# ============================================================================
# CROP_BOX is a single crop region applied to every rendered page image before
# OCR. Set it to None to disable it, or to (left, top, right, bottom) in pixels
# of the rendered page image.
# Example: CROP_BOX = (100, 200, 800, 1000) crops from x=100,y=200 to
# x=800,y=1000.
#
# The value below is a suggested crop for the ticket area (based on a preview
# image) that focuses on just the lottery numbers section (rows A-J).
# Adjust it to your actual page dimensions: run the script with --preview or
# --save-images, which saves the page images and prints their pixel size.
CROP_BOX = (50, 440, 620, 770) # Crops to the ticket rows area
# CROP_BOXES is an optional list of crop regions for pages whose tickets sit in
# several different areas; every region is OCR'd separately.
# NOTE: during OCR, CROP_BOXES is only used when CROP_BOX is None (CROP_BOX
# takes precedence). Preview image saving writes crops for both settings.
# Example:
# CROP_BOXES = [
# (100, 200, 800, 600), # First ticket area
# (100, 650, 800, 1050), # Second ticket area
# ]
CROP_BOXES = None
# ============================================================================
# Compact / labelled row: a row letter, five 2-digit numbers, a Powerball
# marker ("Powerball:", "P:" or "P") and the 2-digit Powerball number, with
# optional whitespace between the parts. Case-insensitive to tolerate OCR
# case variations.
_COMPACT_ROW_RE = re.compile(
    r"([A-J])[\.\s]*(\d{2})\s*(\d{2})\s*(\d{2})\s*(\d{2})\s*(\d{2})\s*(?:Powerball:|P:?)\s*(\d{2})",
    re.IGNORECASE,
)
# Row letter at the start of the line or after whitespace, followed by "." or
# whitespace.
_ROW_LETTER_RE = re.compile(r"(?:^|\s)([A-J])[\.\s]")
# Stand-alone 2-digit numbers.
_TWO_DIGIT_RE = re.compile(r"\b(\d{2})\b")
# Output keys for the six extracted numbers, in ticket order.
_NUMBER_FIELDS = ("num1", "num2", "num3", "num4", "num5", "pb_num")


def _build_row(page_num, row_letter, numbers):
    """Build one result dict from a row letter and its six number strings."""
    row = {"page": page_num, "row": row_letter}
    row.update(zip(_NUMBER_FIELDS, numbers))
    return row


def extract_numbers_from_text(text, page_num, debug=False):
    """
    Extract lottery rows from OCR text.

    Every non-empty line is tried against two formats, in this order:

    1. Compact / labelled, e.g. "A1924394565P:22Q6" or
       "I. 20 38 42 53 58 Powerball: 14 Q6": row letter, five 2-digit
       numbers, a Powerball marker, then the 2-digit Powerball number.
    2. Spaced fallback, e.g. "A 19 24 39 45 65 22": an upper-case row letter
       followed by "." or whitespace, and at least six stand-alone 2-digit
       numbers anywhere on the line. The first five are the regular numbers
       and the sixth is the Powerball number.

    Args:
        text: OCR output for one page or crop. None is treated as empty text.
        page_num: Page number stored in each result's "page" field.
        debug: If True, print the raw OCR text and per-line extraction details.

    Returns:
        A list of dicts with the keys page, row, num1..num5 and pb_num, in
        the order the rows appear in the text. The numbers are kept as the
        2-character strings that were matched (leading zeros preserved), and
        the row letter is always upper case.
    """
    text = text or ""
    if debug:
        print(f"\n{'='*60}")
        print(f"OCR TEXT FOR PAGE {page_num}:")
        print(f"{'='*60}")
        print(text)
        print(f"{'='*60}\n")

    results = []
    for line in text.splitlines():
        if not line.strip():
            continue

        # Format 1: compact / labelled row.
        compact_match = _COMPACT_ROW_RE.search(line)
        if compact_match:
            row_letter, *numbers = compact_match.groups()
            # The pattern is case-insensitive, so normalise the letter; the
            # spaced format below only ever yields upper-case letters.
            result = _build_row(page_num, row_letter.upper(), numbers)
            results.append(result)
            if debug:
                print(f"Line: {line}")
                print(f" ✓ Compact format extracted: {result}")
            continue

        # Format 2: spaced row without a usable Powerball marker.
        row_match = _ROW_LETTER_RE.search(line)
        if not row_match:
            continue
        row_letter = row_match.group(1)
        numbers = _TWO_DIGIT_RE.findall(line)
        if debug:
            print(f"Line: {line}")
            print(f" Row: {row_letter}, Numbers found: {numbers}")
        # Need at least 5 regular numbers + 1 Powerball number; anything
        # after the sixth number is ignored.
        if len(numbers) >= 6:
            result = _build_row(page_num, row_letter, numbers[:6])
            results.append(result)
            if debug:
                print(f" ✓ Spaced format extracted: {result}")
    return results
def process_pdf(
    pdf_path, output_csv, debug=False, preview_pages=None, save_images=False
):
    """
    Process a PDF file and extract lottery numbers to a CSV file.

    Each page is rendered to an image, cropped according to the module-level
    CROP_BOX / CROP_BOXES settings, run through Tesseract, and parsed with
    extract_numbers_from_text(). The CSV is only written when at least one
    row was extracted; otherwise a hint is printed instead.

    Args:
        pdf_path: Path to the PDF file.
        output_csv: Path to the output CSV file.
        debug: If True, print OCR text and extraction details.
        preview_pages: List of 1-based page numbers to process (e.g.
            [1, 2, 3]). When given, only these pages are rendered, and
            preview images are saved for them.
        save_images: If True, save the rendered (and cropped) page images to
            the current working directory.

    Returns:
        None. If the PDF cannot be converted, an error message with poppler
        install hints is printed and the function returns early.
    """
    print(f"Processing PDF: {pdf_path}")
    print("Converting PDF to images... (this may take a while)")

    # Keep every image paired with its real page number. Recovering the
    # number later by list position mislabels pages as soon as one requested
    # page yields no image (e.g. a page number past the end of the PDF).
    pages = []
    try:
        if preview_pages:
            # Only convert the specified pages.
            for page_num in preview_pages:
                rendered = pdf2image.convert_from_path(
                    pdf_path, first_page=page_num, last_page=page_num
                )
                if not rendered:
                    print(f"Warning: no image rendered for page {page_num}; skipping")
                pages.extend((page_num, img) for img in rendered)
        else:
            pages = list(enumerate(pdf2image.convert_from_path(pdf_path), start=1))
    except Exception as e:
        # Deliberately broad: whatever the conversion raised is reported
        # together with the most likely cause (poppler not installed).
        print(f"Error converting PDF: {e}")
        print("Make sure poppler-utils is installed:")
        print(" Ubuntu/Debian: sudo apt-get install poppler-utils")
        print(" macOS: brew install poppler")
        return
    print(f"PDF converted to {len(pages)} images")

    # Preview mode - save images and show dimensions so that CROP_BOX can be
    # tuned against the real pixel size of the pages.
    if save_images or preview_pages:
        print("\nSaving preview images...")
        for page_num, image in pages:
            output_path = f"page_{page_num}_preview.png"
            image.save(output_path)
            print(f"Saved: {output_path} (Size: {image.width}x{image.height})")
            # If cropping is enabled, also save cropped versions.
            if CROP_BOX:
                cropped = image.crop(CROP_BOX)
                crop_path = f"page_{page_num}_cropped.png"
                cropped.save(crop_path)
                print(
                    f"Saved cropped: {crop_path} (Size: {cropped.width}x{cropped.height})"
                )
            if CROP_BOXES:
                for j, box in enumerate(CROP_BOXES):
                    cropped = image.crop(box)
                    crop_path = f"page_{page_num}_crop_{j+1}.png"
                    cropped.save(crop_path)
                    print(
                        f"Saved cropped region {j+1}: {crop_path} (Size: {cropped.width}x{cropped.height})"
                    )

    # Tesseract settings: default engine (--oem 3), page segmentation mode 6,
    # and a character whitelist restricted to upper-case letters, digits,
    # ":" and ")". Built once because it is the same for every image.
    custom_config = r"--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:)PB"

    all_results = []
    for page_num, image in pages:
        # In non-debug mode the progress line overwrites itself ("\r").
        print(f"Processing page {page_num}...", end="\r" if not debug else "\n")

        # CROP_BOX takes precedence; CROP_BOXES is only used when CROP_BOX is
        # disabled; with neither set the whole page is OCR'd.
        if CROP_BOX:
            images_to_process = [image.crop(CROP_BOX)]
        elif CROP_BOXES:
            images_to_process = [image.crop(box) for box in CROP_BOXES]
        else:
            images_to_process = [image]

        for proc_img in images_to_process:
            text = pytesseract.image_to_string(proc_img, config=custom_config)
            all_results.extend(
                extract_numbers_from_text(text, page_num, debug=debug)
            )

    print(f"\nExtracted {len(all_results)} lottery ticket rows")

    if all_results:
        fieldnames = [
            "page",
            "row",
            "num1",
            "num2",
            "num3",
            "num4",
            "num5",
            "pb_num",
        ]
        # newline="" lets the csv module control line endings itself.
        with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)
        print(f"Results saved to: {output_csv}")
    else:
        print(
            "No lottery numbers found. You may need to adjust the OCR settings or pattern matching."
        )
        if not debug:
            print("\nTry running with --debug flag to see the OCR output:")
            print(f" python lottery_ocr.py {pdf_path} --debug --preview 1")
def main():
    """Command-line entry point: parse the arguments and run process_pdf()."""
    # Shown verbatim below the option list (RawDescriptionHelpFormatter keeps
    # the line breaks as written).
    usage_examples = """
Examples:
# Preview first page with debug output
python lottery_ocr.py tickets.pdf --debug --preview 1
# Preview pages 1-3 and save images
python lottery_ocr.py tickets.pdf --preview 1 2 3 --save-images
# Process entire PDF
python lottery_ocr.py tickets.pdf output.csv
# Process with debug output
python lottery_ocr.py tickets.pdf --debug
After previewing, edit CROP_BOX in the script:
CROP_BOX = (left, top, right, bottom)
Example: CROP_BOX = (100, 200, 800, 1000)
"""
    cli = argparse.ArgumentParser(
        description="Extract lottery numbers from PDF to CSV",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("pdf_file", help="Path to PDF file")
    cli.add_argument(
        "output_csv",
        nargs="?",
        help="Output CSV file (default: <pdf_name>_results.csv)",
    )
    cli.add_argument(
        "--debug",
        "-d",
        action="store_true",
        help="Show OCR text and extraction details",
    )
    cli.add_argument(
        "--preview",
        "-p",
        nargs="+",
        type=int,
        metavar="PAGE",
        help="Preview specific pages (e.g., --preview 1 2 3)",
    )
    cli.add_argument(
        "--save-images",
        "-s",
        action="store_true",
        help="Save page images to disk for inspection",
    )
    opts = cli.parse_args()

    source_pdf = opts.pdf_file
    if not Path(source_pdf).exists():
        print(f"Error: File not found: {source_pdf}")
        sys.exit(1)

    # Without an explicit output name, derive one from the PDF's base name;
    # the file is then created in the current working directory.
    target_csv = (
        opts.output_csv
        if opts.output_csv
        else Path(source_pdf).stem + "_results.csv"
    )

    # Report the active crop settings before the (slow) conversion starts.
    if CROP_BOX:
        print(f"Using CROP_BOX: {CROP_BOX}")
    if CROP_BOXES:
        print(f"Using CROP_BOXES: {CROP_BOXES}")

    process_pdf(
        source_pdf,
        target_csv,
        debug=opts.debug,
        preview_pages=opts.preview,
        save_images=opts.save_images,
    )


if __name__ == "__main__":
    main()