added extraction of verb conjugations
This commit is contained in:
parent
65bf3cd811
commit
f7ee692eed
2 changed files with 38 additions and 0 deletions
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
archive
|
||||||
|
nikkud.csv
|
||||||
|
practice.py
|
||||||
|
cardinal_one_to_ten.*
|
||||||
|
*.swp
|
||||||
|
bin**
|
||||||
|
lib**
|
||||||
|
include**
|
||||||
|
lib64**
|
||||||
|
pyvenv.cfg
|
||||||
28
conjugation_extract.py
Executable file
28
conjugation_extract.py
Executable file
|
|
@ -0,0 +1,28 @@
|
||||||
|
#!./bin/python3
|
||||||
|
import io

import numpy as np
import pandas as pd
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_website(url_suffixes=('2255-lishmor', '860-lishon'),
                         output_path='conjugations.csv'):
    """Scrape verb conjugation tables from pealim.com into a single CSV file.

    For every dictionary entry, the first HTML table of the page is read,
    flattened into one row and labelled with one column per conjugated form.
    The combined rows are written to ``output_path`` (``;``-separated, with
    the row index) and also printed to stdout.

    Args:
        url_suffixes: Entry slugs appended to ``https://www.pealim.com/dict/``.
        output_path: Destination of the generated CSV file.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        ValueError: If a page holds no table, or its table does not have the
            number of cells this function expects.
    """
    columns = [
        'present ms', 'present fs', 'present mp', 'present fp',
        'past 1s', 'past 1p', 'past 2ms', 'past 2fs', 'past 2mp', 'past 2fp',
        'past 3ms', 'past 3fs', 'past 3p',
        'future 1s', 'future 1p', 'future 2ms', 'future 2fs', 'future 2mp',
        'future 2fp', 'future 3ms', 'future 3fs', 'future 3mp', 'future 3fp',
        'imperative ms', 'imperative fs', 'imperative mp', 'imperative fp',
        'infinitive',
    ]
    # Positions in the flattened table that have no column of their own.
    # NOTE(review): presumably repeated/merged cells of the site's table
    # layout -- confirm against the live page if the markup changes.
    dropped_cells = [5, 7, 15, 17, 19, 33, 34, 35]
    expected_cells = len(columns) + len(dropped_cells)

    # Display preferences sent to the site; identical for every request, so
    # they are built once outside the loop.
    # NOTE(review): presumably disables transliteration and meanings and
    # selects a Hebrew rendering style -- verify against pealim.com.
    cookies = {'translit': 'none', 'hebstyle': 'bp', 'showmeaning': 'off'}

    rows = []
    for url_suffix in url_suffixes:
        url = f"https://www.pealim.com/dict/{url_suffix}"
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, cookies=cookies, timeout=30)
        # Fail loudly instead of parsing an HTTP error page as a table.
        response.raise_for_status()

        # Wrap the payload in a file-like object: handing literal markup to
        # read_html is discouraged by pandas.
        table = pd.read_html(io.BytesIO(response.content))[0]

        # The first two table columns are skipped; the rest hold the forms.
        cells = table.iloc[:, 2:].to_numpy().flatten()
        if cells.size != expected_cells:
            raise ValueError(
                f"{url}: expected {expected_cells} table cells, "
                f"got {cells.size}"
            )
        rows.append(np.delete(cells, dropped_cells))

    # Build the frame once rather than concatenating inside the loop.
    new_df = pd.DataFrame(rows, columns=columns)

    new_df.to_csv(output_path, sep=';', index=True)
    print(new_df.to_string())
|
||||||
|
|
||||||
|
# Run the extraction only when executed as a script, so that importing this
# module does not trigger network requests or overwrite the CSV file.
if __name__ == "__main__":
    extract_from_website()
|
||||||
|
|
||||||
Loading…
Reference in a new issue