hebrew_flash_cards/pyproject.toml
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

83 lines
2 KiB
TOML

# Core package metadata (PEP 621) and runtime requirements.
[project]
name = "hebrew-flash-cards"
version = "0.13"
description = "Hebrew vocabulary & verb conjugation flashcards for Anki"
requires-python = ">=3.11"
# Runtime requirements as PEP 508 specifiers, kept in alphabetical order.
# NOTE(review): the numpy and pandas floors look older than the Python 3.11
# minimum declared above — confirm both packages are still imported and
# raise the floors to the oldest versions actually tested.
dependencies = [
    "beautifulsoup4>=4.11.0",
    "genanki>=0.8.0",
    "lxml>=4.9.0",
    "numpy>=1.21.0",
    "pandas>=1.3.0",
    "pymupdf>=1.23.0",
    "pypdf>=3.0.0",
    "python-bidi>=0.4.2",
    "requests>=2.26.0",
]
# Optional extras. `dev` lists the test, lint and static-analysis tools;
# all are left unpinned.
[project.optional-dependencies]
dev = [
    "bandit",
    "pytest",
    "ruff",
    "vulture",
]
# pytest: collect from tests/ only and register the custom markers in use.
[tool.pytest.ini_options]
testpaths = ["tests"]
# Each entry is "<name>: <description>".
markers = [
    "integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
]
# Ruff: settings shared by the linter and the formatter.
[tool.ruff]
# Same Python floor as `requires-python` in [project].
target-version = "py311"
line-length = 120
# `extend-exclude` adds to Ruff's built-in exclusions (.git, .venv,
# node_modules, dist, ...) instead of replacing them, which is what a plain
# `exclude` key does. The entries below are project-specific directories:
# presumably an in-place virtualenv (lib/, bin/, include/, lib64/, venv/)
# plus archived code — verify against the repository layout.
extend-exclude = [
    "lib/",
    "bin/",
    "include/",
    "lib64/",
    "archive/",
    "venv/",
]
# Ruff linter: enabled rule families, then the individual opt-outs.
[tool.ruff.lint]
select = [
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "F",    # pyflakes
    "I",    # isort (import ordering)
    "UP",   # pyupgrade
    "B",    # flake8-bugbear
    "SIM",  # flake8-simplify
    "PIE",  # flake8-pie
    "T20",  # flake8-print (T201 is switched off again in `ignore`)
    "RET",  # flake8-return
    "C4",   # flake8-comprehensions
    "S",    # flake8-bandit (security checks)
]
# Every suppression carries its justification; keep that habit when adding more.
ignore = [
    "T201",  # print() is allowed: this is a CLI tool, not a library
    "S603",  # subprocess calls with shell=False are acceptable
    "S607",  # partial executable paths are acceptable for CLI tools
    "S105",  # PASS = "✓" is a status glyph, not a password
    "S108",  # /tmp paths are intentional for temporary downloads
    "S311",  # random.Random() drives card ordering, not cryptography
    "E501",  # long lines are left to the formatter
]
# Per-file rule suppressions, keyed by glob pattern.
[tool.ruff.lint.per-file-ignores]
# S101 flags `assert`; allow it in test modules.
"test_*.py" = ["S101"]
# Ruff formatter: indent with spaces and prefer double-quoted strings.
[tool.ruff.format]
indent-style = "space"
quote-style = "double"
# Vulture (dead-code finder): scan the whole repository, skipping the
# directories listed in `exclude`.
[tool.vulture]
paths = ["."]
exclude = [
    "lib/",
    "bin/",
    "include/",
    "lib64/",
    "venv/",
    "archive/",
]
# Only report findings with at least this confidence (percent).
min_confidence = 80
# Bandit (security scanner). Bandit only reads this table when it is
# invoked with `-c pyproject.toml`.
[tool.bandit]
exclude_dirs = [
    "lib",
    "bin",
    "include",
    "lib64",
    "venv",
    "archive",
]
# B101 is the assert_used check; assert statements are allowed.
skips = ["B101"]