#!./bin/python3
# Origin: git patch f7ee692eed6ed8ebbe8c68805a7e6b6dd0f09761
#   From: nevo
#   Date: Mon, 21 Jul 2025 01:43:47 -0700
#   Subject: [PATCH] added extraction of verb conjugations
#   The patch creates conjugation_extract.py (mode 100755, this script) and a
#   .gitignore with these entries:
#     archive, nikkud.csv, practice.py, cardinal_one_to_ten.*, *.swp,
#     bin**, lib**, include**, lib64**, pyvenv.cfg
"""Download verb conjugation tables from pealim.com and save them as CSV.

Each verb page is fetched, its first HTML table is flattened into a single
row of conjugated forms, and all rows are written to a ';'-separated file.
"""
import io

import numpy as np
import pandas as pd

# Output column labels, in the order the forms appear in the flattened table.
COLUMNS = ['present ms', 'present fs', 'present mp', 'present fp', 'past 1s', 'past 1p', 'past 2ms', 'past 2fs', 'past 2mp', 'past 2fp', 'past 3ms', 'past 3fs', 'past 3p', 'future 1s', 'future 1p', 'future 2ms', 'future 2fs', 'future 2mp', 'future 2fp', 'future 3ms', 'future 3fs', 'future 3mp', 'future 3fp', 'imperative ms', 'imperative fs', 'imperative mp', 'imperative fp', 'infinitive']

# Verb pages scraped when the caller does not supply its own list.
DEFAULT_URL_SUFFIXES = ['2255-lishmor', '860-lishon']

# Positions in the flattened table that are dropped before labelling.
# NOTE(review): presumably these are the repeated copies pandas emits when it
# expands table cells spanning several columns (e.g. a single "1st person
# singular" cell) -- confirm against the live page layout.
_DUPLICATE_CELL_INDICES = [5, 7, 15, 17, 19, 33, 34, 35]

# Cookies sent with every request (values taken verbatim from the site's
# display settings: no transliteration, 'bp' Hebrew style, meanings hidden).
_COOKIES = {'translit': 'none', 'hebstyle': 'bp', 'showmeaning': 'off'}

# Seconds to wait for the server; requests waits forever when no timeout is set.
_REQUEST_TIMEOUT = 30


def _fetch_conjugation_table(url_suffix):
    """Download one verb page and return its first HTML table as a DataFrame."""
    # Imported here so the reshaping code can be imported without the HTTP
    # dependency being installed.
    import requests

    url = f"https://www.pealim.com/dict/{url_suffix}"
    response = requests.get(url, cookies=_COOKIES, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a table.
    response.raise_for_status()
    # Wrap the payload in a file-like object: passing literal HTML to
    # read_html is deprecated in recent pandas releases.
    return pd.read_html(io.BytesIO(response.content))[0]


def _conjugation_row(table):
    """Flatten a conjugation table into one array ordered like COLUMNS."""
    # The first two table columns are skipped; only the form cells are kept.
    cells = table.iloc[:, 2:].to_numpy().flatten()
    expected = len(COLUMNS) + len(_DUPLICATE_CELL_INDICES)
    if cells.size != expected:
        # The fixed deletion indices are only valid for this exact layout.
        raise ValueError(
            f"unexpected conjugation table layout: got {cells.size} cells, "
            f"expected {expected}"
        )
    return np.delete(cells, _DUPLICATE_CELL_INDICES)


def extract_from_website(url_suffixes=None, output_path='conjugations.csv'):
    """Scrape conjugations for the given verbs, save them to CSV and print them.

    Args:
        url_suffixes: Iterable of page suffixes appended to
            ``https://www.pealim.com/dict/``. Defaults to
            ``DEFAULT_URL_SUFFIXES`` when ``None``.
        output_path: Destination of the ';'-separated CSV file.

    Returns:
        A DataFrame with one row per verb and the labels in ``COLUMNS``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        ValueError: If a page's table does not have the expected cell count.
    """
    if url_suffixes is None:
        url_suffixes = DEFAULT_URL_SUFFIXES

    # Collect all rows first and build the frame once, rather than
    # re-concatenating a growing DataFrame on every iteration.
    rows = [
        _conjugation_row(_fetch_conjugation_table(url_suffix))
        for url_suffix in url_suffixes
    ]
    new_df = pd.DataFrame(rows, columns=COLUMNS)

    new_df.to_csv(output_path, sep=';', index=True)
    print(new_df.to_string())
    return new_df


if __name__ == "__main__":
    extract_from_website()