hebrew_flash_cards/pealim_extract.py

187 lines
5.8 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract Hebrew vocabulary from pealim.com dictionary.
Scrapes word entries, roots, and parts of speech for Anki flashcards.
"""
import io
import logging
import time
from typing import Optional

import pandas as pd
import requests
# Root logging setup: INFO level, "<time> - <level> - <message>" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Scraper settings.
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_TIMEOUT = 10  # seconds
REQUEST_DELAY = 1.5  # seconds between requests (respectful scraping)

# One shared HTTP session so connections are pooled across requests.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
# Page count used until the real value is parsed from the site (see TODO below).
_DEFAULT_TOTAL_PAGES = 608


def get_total_pages() -> int:
    """Return the number of dictionary pages to scrape.

    Requests the first dictionary page and parses its HTML tables as a
    sanity check. The count itself is not yet derived from the response:
    the hard-coded default is returned whether or not the request succeeds.
    A failure is only logged.

    Returns:
        The page count (currently always ``_DEFAULT_TOTAL_PAGES``).
    """
    try:
        logger.info("Fetching total page count...")
        cookies = {'translit': 'none', 'hebstyle': 'mo'}
        response = session.get(PEALIM_DICT_URL, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # read_html expects a path, URL or file-like object; passing the raw
        # payload directly is deprecated in recent pandas, so wrap it.
        pd.read_html(io.BytesIO(response.content))
        # TODO: estimate the real page count from this first page
        # (typically 15 words per page) instead of using the default.
    except Exception as e:
        # Best effort: any failure here falls back to the default count.
        logger.error(f"Error fetching page count: {e}. Using default ({_DEFAULT_TOTAL_PAGES}).")
    # Single exit point so the function always returns an int.
    return _DEFAULT_TOTAL_PAGES
def _fetch_last_table(url: str, cookies: dict) -> pd.DataFrame:
    """Download *url* with the given cookies and return the last HTML table on the page."""
    response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # read_html expects a path, URL or file-like object, so wrap the raw bytes.
    return pd.read_html(io.BytesIO(response.content))[-1]


def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.

    Each page is requested twice: once with the 'mo' Hebrew style (with
    nikkud) and once with the 'vl' style (without nikkud). The 'Word' column
    of the second response is attached to the first table as
    'Word Without Nikkud'.

    A page that fails with a network/HTTP error is retried once after a
    longer pause; a page that still fails, or fails with any other error,
    is logged and skipped.

    Args:
        max_pages: Maximum pages to scrape (None or 0 = the count reported
            by get_total_pages()).

    Returns:
        DataFrame with the columns of the site's table plus
        'Word Without Nikkud'. Empty if no page could be scraped.
    """
    total_pages = max_pages or get_total_pages()
    logger.info(f"Starting extraction from {total_pages} pages...")

    max_attempts = 2  # first try + one retry
    page_frames = []
    # The loop starts at page 1; "+ 1" makes the upper bound inclusive so the
    # last page is scraped too (and max_pages=1 scrapes exactly one page).
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        for attempt in range(1, max_attempts + 1):
            try:
                # First request: with nikkud
                with_nikkud = _fetch_last_table(
                    url, {'translit': 'none', 'hebstyle': 'mo'}
                )
                # Second request: without nikkud
                plain = _fetch_last_table(
                    url, {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}
                )
                without_nikkud_words = plain['Word'].rename('Word Without Nikkud')
                page_frames.append(
                    pd.concat([with_nikkud, without_nikkud_words], axis=1)
                )
            except requests.RequestException as e:
                will_retry = attempt < max_attempts
                next_step = "Retrying..." if will_retry else "Skipping page."
                logger.error(f"Error fetching page {page_num}: {e}. {next_step}")
                # Back off longer than usual after a failed request.
                time.sleep(REQUEST_DELAY * 2)
                if will_retry:
                    continue
            except Exception as e:
                # Parsing/shape problems will not be fixed by retrying.
                logger.error(f"Unexpected error on page {page_num}: {e}")
            else:
                if page_num % 50 == 0:
                    logger.info(f"Processed {page_num}/{total_pages} pages...")
                time.sleep(REQUEST_DELAY)
            break

    # Concatenate once at the end instead of growing a DataFrame per page.
    if page_frames:
        df = pd.concat(page_frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    logger.info(f"Extraction complete. Total words: {len(df)}")
    return df
# Separator characters removed from a root before it is used inside a tag.
_ROOT_SEPARATORS = str.maketrans('', '', ' -.')

# English part-of-speech keyword -> Hebrew Anki tag. Keywords are checked in
# this order and the first one found in the part-of-speech text wins.
_POS_TAGS = {
    'Adverb': 'תוארי_הפועל',
    'Pronoun': 'כינויי_גוף',  # was 'כינוייוף' (typo: missing '_ג')
    'Noun': 'שם_עצם',
    'Verb': 'פעלים',
    'Adjective': 'שם_תואר',
    'Preposition': 'מילות_יחס',
    'Conjunction': 'מילות_חיבור',
    'Particle': 'מילית',
}


def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.

    Adds two columns to ``df`` (the frame is modified in place and also
    returned):

    * 'shared roots': space-separated words that have the same root as the
      row's word, excluding entries equal to the word itself. Empty when the
      root is missing or is '-'.
    * 'tags': space-separated Anki tags: a "שורש::<root>" tag (root with
      spaces, hyphens and dots removed) when the row has a root, followed by
      a part-of-speech tag when a known keyword occurs in 'Part of Speech'.

    Args:
        df: Dictionary DataFrame with 'Word', 'Root' and 'Part of Speech'
            columns (an entirely empty frame is accepted).

    Returns:
        The same DataFrame, ready for Anki.
    """
    logger.info("Preparing data for Anki...")
    records = df.to_dict('records')

    def has_root(root) -> bool:
        # '-' is treated as "no root", just like a missing value.
        return bool(pd.notna(root)) and root != '-'

    # Group words by root once, instead of rescanning the frame for each row.
    words_by_root = {}
    for record in records:
        if has_root(record['Root']):
            words_by_root.setdefault(record['Root'], []).append(record['Word'])

    shared_root_words = []
    tags = []
    for record in records:
        root = record['Root']
        word = record['Word']

        # Other words with the same root
        siblings = words_by_root[root] if has_root(root) else ()
        shared_root_words.append(
            ' '.join(str(w) for w in siblings if w != word)
        )

        tag_parts = []
        # Root tag: test for a missing value explicitly rather than looking
        # for the text 'nan' in the stringified root.
        if pd.notna(root):
            root_clean = str(root).translate(_ROOT_SEPARATORS)
            if root_clean:
                tag_parts.append(f"שורש::{root_clean}")
        # Part of speech tag
        pos = str(record['Part of Speech'])
        pos_tag = next(
            (tag for keyword, tag in _POS_TAGS.items() if keyword in pos),
            None,
        )
        if pos_tag is not None:
            tag_parts.append(pos_tag)
        tags.append(' '.join(tag_parts))

    df['shared roots'] = shared_root_words
    df['tags'] = tags
    logger.info("Anki preparation complete.")
    return df
def main():
    """Run the pipeline: scrape the dictionary, save it, then save the Anki version."""
    raw_csv = 'pealim_dict.csv'
    anki_csv = 'pealim_dict_for_anki.csv'
    try:
        # Step 1: scrape and keep the untouched result on disk.
        dictionary = extract_from_website()
        dictionary.to_csv(raw_csv, index=True)
        logger.info(f"Saved: {raw_csv}")

        # Step 2: add the Anki columns and export with ';' as separator.
        anki_ready = modify_for_anki(dictionary)
        anki_ready.to_csv(anki_csv, sep=';', index=True)
        logger.info(f"Saved: {anki_csv}")

        logger.info("✅ Complete!")
    except Exception as exc:
        # Log the failure, then let it propagate to the caller.
        logger.error(f"Fatal error: {exc}")
        raise


# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()