# Change note:
# Migrate from fragmented CSV + 10 JSON files to a single data/words.json
# (9,104 entries) as the unified data store. All GUIDs are preserved for Anki
# study-progress continuity.
#
# New files:
# - SCHEMA.yaml: authoritative schema for words.json
# - pealim_list_scrape.py: consolidated list page scraper → words.json
# - pealim_detail_scrape.py: noun/verb detail scraper → words.json
# - pealim_audio_download.py: audio downloader reading from words.json
# - scripts/migrate_to_json.py: one-time CSV→JSON migration
# - scripts/validate_data.py: 17 data integrity tests
# - scripts/check_guid_coverage.py: GUID preservation checker
# - scripts/repair_slugs.py: slug deduplication repair tool
# - tests/test_scraper_integration.py: live scraper integration tests
#
# Updated:
# - apkg_builder.py: reads from words.json (no more pandas)
# - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build)
# - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# Original file as shown by the source viewer: 83 lines, 2 KiB, TOML.
# Core package metadata (PEP 621 `[project]` table).
[project]
name = "hebrew-flash-cards"
version = "0.13"
description = "Hebrew vocabulary & verb conjugation flashcards for Anki"
requires-python = ">=3.11"
# Runtime dependencies, one PEP 508 requirement per line, sorted by name.
# NOTE(review): the change note at the top of this file says apkg_builder.py
# no longer uses pandas — confirm whether pandas (and numpy) are still imported
# by other modules before keeping them as runtime dependencies.
dependencies = [
    "beautifulsoup4>=4.11.0",
    "genanki>=0.8.0",
    "lxml>=4.9.0",
    "numpy>=1.21.0",
    "pandas>=1.3.0",
    "pymupdf>=1.23.0",
    "pypdf>=3.0.0",
    "python-bidi>=0.4.2",
    "requests>=2.26.0",
]

# Optional extras. `dev` bundles the lint, test, and static-analysis tools
# that are configured under the `[tool.*]` tables of this file.
[project.optional-dependencies]
dev = [
    "bandit",
    "pytest",
    "ruff",
    "vulture",
]

# pytest configuration: collect tests from `tests/` and register the custom
# `integration` marker so it can be selected or deselected with `-m`.
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
    "integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
]

# Ruff base settings. `target-version` matches `requires-python = ">=3.11"`.
[tool.ruff]
target-version = "py311"
line-length = 120
# Directories ruff must not lint or format.
# NOTE(review): lib/, bin/, include/, lib64/ look like an in-tree virtualenv
# layout — confirm these are not project source directories.
exclude = [
    "lib/",
    "bin/",
    "include/",
    "lib64/",
    "archive/",
    "venv/",
]

# Ruff lint rules: `select` enables whole rule families, `ignore` then turns
# off individual rules; the reason for each is given in its trailing comment.
[tool.ruff.lint]
select = [
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "F",    # pyflakes
    "I",    # isort
    "UP",   # pyupgrade
    "B",    # flake8-bugbear
    "SIM",  # flake8-simplify
    "PIE",  # flake8-pie
    "T20",  # flake8-print (flag print statements)
    "RET",  # flake8-return
    "C4",   # flake8-comprehensions
    "S",    # flake8-bandit (security)
]
ignore = [
    "T201",  # allow print() — this is a CLI tool, not a library
    "S603",  # subprocess call with shell=False is fine
    "S607",  # partial executable path is fine for CLI tools
    "S105",  # PASS = "✓" is not a password
    "S108",  # /tmp paths are intentional for temp downloads
    "S311",  # random.Random() is for card ordering, not crypto
    "E501",  # line too long — handled by formatter
]

# Per-file rule exemptions, keyed by file glob.
[tool.ruff.lint.per-file-ignores]
"test_*.py" = ["S101"]  # allow assert in tests

# Ruff formatter style: double-quoted strings, space indentation.
[tool.ruff.format]
quote-style = "double"
indent-style = "space"

# Vulture (dead-code finder): scan the whole repository except the excluded
# directories, reporting only findings with at least 80% confidence.
[tool.vulture]
paths = ["."]
exclude = ["lib/", "bin/", "include/", "lib64/", "venv/", "archive/"]
min_confidence = 80

# Bandit (security linter): skip the same directories that the ruff and
# vulture configurations in this file exclude.
[tool.bandit]
exclude_dirs = ["lib", "bin", "include", "lib64", "venv", "archive"]
skips = ["B101"]  # allow assert