187 lines
5.8 KiB
Python
Executable file
187 lines
5.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
Extract Hebrew vocabulary from the pealim.com dictionary.

Scrapes word entries, roots, and parts of speech, and prepares them for
import into Anki as flashcards.
"""
|
||
|
||
import requests
|
||
import pandas as pd
|
||
import logging
|
||
import time
|
||
from typing import Optional
|
||
|
||
# Scraper settings.
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # pause between page fetches, in seconds (keeps the scrape polite)
REQUEST_TIMEOUT = 10  # per-request timeout, in seconds

# Logging: INFO and above, each record prefixed with a timestamp and level.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# A single shared HTTP session (connection pooling) that identifies the scraper.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
|
||
|
||
|
||
def get_total_pages() -> int:
    """Return the number of dictionary pages to scrape.

    Requests the first dictionary page and parses its tables, which acts as a
    reachability check. The page count is NOT yet derived from the response:
    every path returns the hard-coded default below.

    Returns:
        The page count (currently always 608). An int is returned on every
        path, including when the request fails or no tables are parsed, so
        callers can pass the result straight to ``range()``.
    """
    # Hard-coded page count, used until real pagination parsing is implemented.
    default_pages = 608

    try:
        logger.info("Fetching total page count...")
        cookies = {'translit': 'none', 'hebstyle': 'mo'}
        response = session.get(PEALIM_DICT_URL, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        tables = pd.read_html(response.content)
        if not tables:
            logger.warning(f"No tables found on first page. Using default ({default_pages}).")
    except Exception as e:
        # Best-effort probe: any failure falls back to the default count.
        logger.error(f"Error fetching page count: {e}. Using default ({default_pages}).")

    return default_pages
|
||
|
||
|
||
def _fetch_page_tables(url: str, cookies: dict) -> list:
    """Download *url* with the given display cookies and return its parsed HTML tables."""
    response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return pd.read_html(response.content)


def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.

    Each page is fetched twice with different display cookies: once for the
    full word table (with nikkud) and once to read the 'Word' column without
    nikkud. The last table on each page is taken as the word table.

    Args:
        max_pages: Number of pages to scrape, starting at page 1
            (None = as many as ``get_total_pages()`` reports).

    Returns:
        DataFrame holding the columns of the site's word table plus a
        'Word Without Nikkud' column, one row per entry, with a fresh
        0..n-1 index. Empty if no page could be scraped.
    """
    # `is None` rather than `or`, so max_pages=0 means "no pages", not "all pages".
    total_pages = get_total_pages() if max_pages is None else max_pages
    logger.info(f"Starting extraction from {total_pages} pages...")

    max_attempts = 2  # one initial try plus one retry on network errors
    page_frames = []  # collected per page, concatenated once at the end

    # Pages are numbered from 1; the upper bound is inclusive so the last page is scraped.
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"

        for attempt in range(1, max_attempts + 1):
            try:
                # First request: with nikkud
                with_nikkud = _fetch_page_tables(
                    url, {'translit': 'none', 'hebstyle': 'mo'}
                )[-1]

                # Second request: without nikkud
                plain_words = _fetch_page_tables(
                    url, {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}
                )[-1]['Word'].rename('Word Without Nikkud')

                # Both tables come from the same page, so rows line up by position.
                page_frames.append(pd.concat([with_nikkud, plain_words], axis=1))
                break
            except requests.RequestException as e:
                if attempt < max_attempts:
                    logger.error(f"Error fetching page {page_num}: {e}. Retrying...")
                    time.sleep(REQUEST_DELAY * 2)
                else:
                    logger.error(f"Error fetching page {page_num}: {e}. Skipping page.")
            except Exception as e:
                # Parsing/shape problems will not be fixed by retrying; skip the page.
                logger.error(f"Unexpected error on page {page_num}: {e}")
                break

        if page_num % 50 == 0:
            logger.info(f"Processed {page_num}/{total_pages} pages...")

        time.sleep(REQUEST_DELAY)

    df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
    logger.info(f"Extraction complete. Total words: {len(df)}")
    return df
|
||
|
||
|
||
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.

    Adds two columns to *df* in place and returns the same object:

    * 'shared roots' - the other words (space-separated, in frame order) that
      have the same 'Root' value; empty when the root is missing or '-'.
    * 'tags' - space-separated Anki tags: a root tag (the root with spaces,
      dashes and dots removed) followed by a Hebrew part-of-speech tag, each
      included only when available.

    Args:
        df: Dictionary DataFrame with 'Word', 'Root' and 'Part of Speech'
            columns. A completely empty frame is accepted and simply gains
            the two (empty) columns.

    Returns:
        The same DataFrame, with the 'shared roots' and 'tags' columns added.

    Raises:
        KeyError: If a non-empty frame lacks one of the required columns.
    """
    logger.info("Preparing data for Anki...")

    # English part-of-speech keyword -> Hebrew tag. The first keyword found in
    # the part-of-speech text wins, so the order of this table matters.
    pos_tags = {
        'Adverb': 'תוארי_הפועל',
        'Pronoun': 'כינויי_גוף',
        'Noun': 'שם_עצם',
        'Verb': 'פעלים',
        'Adjective': 'שם_תואר',
        'Preposition': 'מילות_יחס',
        'Conjunction': 'מילות_חיבור',
        'Particle': 'מילית',
    }
    # Characters removed from a root before it is used as a tag.
    strip_root_chars = str.maketrans('', '', ' -.')

    if df.empty:
        # Nothing to read; also covers a column-less frame from a failed scrape.
        roots, words, parts = [], [], []
    else:
        roots = df['Root'].tolist()
        words = df['Word'].tolist()
        parts = df['Part of Speech'].tolist()

    # Group words by root once instead of rescanning the whole frame per row.
    words_by_root = {}
    for root, word in zip(roots, words):
        if pd.notna(root) and root != '-':
            words_by_root.setdefault(root, []).append(word)

    shared_root_words = []
    for root, word in zip(roots, words):
        if pd.notna(root) and root != '-':
            siblings = (str(w) for w in words_by_root[root] if w != word)
            shared_root_words.append(' '.join(siblings))
        else:
            shared_root_words.append('')
    df['shared roots'] = shared_root_words

    tags = []
    for root, pos in zip(roots, parts):
        tag_parts = []

        # Root tag (skipped for missing roots and for roots that clean to nothing).
        if pd.notna(root):
            root_clean = str(root).translate(strip_root_chars)
            if root_clean:
                tag_parts.append(f"שורש::{root_clean}")

        # Part of speech tag
        pos_text = str(pos)
        pos_tag = next((tag for key, tag in pos_tags.items() if key in pos_text), None)
        if pos_tag is not None:
            tag_parts.append(pos_tag)

        tags.append(' '.join(tag_parts))
    df['tags'] = tags

    logger.info("Anki preparation complete.")
    return df
|
||
|
||
|
||
def main():
    """Run the whole pipeline: scrape, save the raw CSV, add Anki columns, save the Anki CSV.

    Any exception is logged as a fatal error and then re-raised.
    """
    try:
        # Stage 1: scrape the dictionary and keep an unmodified copy on disk.
        scraped = extract_from_website()
        scraped.to_csv('pealim_dict.csv', index=True)
        logger.info("Saved: pealim_dict.csv")

        # Stage 2: add the Anki columns and write a semicolon-separated file.
        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv('pealim_dict_for_anki.csv', sep=';', index=True)
        logger.info("Saved: pealim_dict_for_anki.csv")

        logger.info("✅ Complete!")
    except Exception as err:
        logger.error(f"Fatal error: {err}")
        raise
|
||
|
||
|
||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|