72 lines
2.5 KiB
Python
Executable file
72 lines
2.5 KiB
Python
Executable file
#!./bin/python3
|
||
import io

import pandas as pd
import requests
|
||
|
||
def _fetch_last_table(url, cookies, timeout):
    """Download *url* with *cookies* and return the last HTML table on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout* seconds.
        ValueError: If pandas finds no table in the returned HTML.
    """
    response = requests.get(url, cookies=cookies, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing parse error later.
    response.raise_for_status()
    # pandas expects literal HTML to be wrapped in a file-like object.
    return pd.read_html(io.BytesIO(response.content))[-1]


def extract_from_website():
    """Scrape every pealim.com dictionary page into ``pealim_dict.csv``.

    Each page is requested twice with different cookies.  The first request
    (``hebstyle=mo``) supplies the full dictionary table.  The second request
    (``hebstyle=vl``, ``showmeaning=off``) supplies only its ``Word`` column,
    which is stored next to the first table as ``Word Without Nikkud``.
    In both cases the last table found on the page is used.

    The combined rows of all pages are written to ``pealim_dict.csv`` in the
    current working directory.

    Raises:
        requests.RequestException: If a page cannot be downloaded.
        ValueError: If a downloaded page contains no HTML table.
    """
    # Number of total pages of dictionary in pealim.com/dict/
    # i.e. Number Of Words / 15
    total_pages = 608
    request_timeout = 30  # seconds; without it a stalled request hangs forever

    full_table_cookies = {'translit': 'none', 'hebstyle': 'mo'}
    without_nikkud_cookies = {
        'translit': 'none',
        'hebstyle': 'vl',
        'showmeaning': 'off',
    }

    page_frames = []
    # range() excludes its stop value, so +1 is needed to fetch the last page.
    for page_num in range(1, total_pages + 1):
        url = f"https://www.pealim.com/dict/?page={page_num}"

        word_table = _fetch_last_table(url, full_table_cookies, request_timeout)

        without_nikkud_words = _fetch_last_table(
            url, without_nikkud_cookies, request_timeout
        )['Word'].rename('Word Without Nikkud')

        page_frames.append(
            pd.concat([word_table, without_nikkud_words], axis=1)
        )

    # Concatenate once at the end; growing a DataFrame inside the loop
    # re-copies every previously collected row on each iteration.
    df = pd.concat(page_frames, ignore_index=True)
    df.to_csv('pealim_dict.csv')
|
||
|
||
# Part-of-speech keywords paired with the Anki tag each one maps to.
# Order matters: the first keyword contained in the cell decides the tag.
_PART_OF_SPEECH_TAGS = (
    ('Adverb', "תוארי_הפועל"),
    ('Pronoun', "כינויי_גוף"),
    ('Noun', "שם_עצם"),
    ('Verb', "פעלים"),
    ('Adjective', "שם_תואר"),
    ('Preposition', "מילות_יחס"),
    ('Conjunction', "מילות_חיבור"),
    ('Particle', "מילית"),
)


def _shared_root_words(df):
    """Return, per row, the other words with the same root, space-separated.

    Rows whose root is ``'-'`` or missing get an empty string.
    """
    # Group the words by root in a single pass instead of re-filtering the
    # whole frame for every row.
    words_by_root = {}
    for root, word in zip(df.Root, df.Word):
        if pd.notna(root) and pd.notna(word) and root != '-':
            words_by_root.setdefault(root, []).append(word)

    shared = []
    for root, word in zip(df.Root, df.Word):
        siblings = words_by_root.get(root, ()) if pd.notna(root) else ()
        # Join the words directly: str() of a NumPy array wraps long lists
        # with newlines and mangles words that contain an apostrophe.
        shared.append(' '.join(w for w in siblings if w != word))
    return shared


def _clean_root(raw_root):
    """Strip spaces and hyphens from a root; missing roots become ``''``."""
    if pd.isna(raw_root):
        return ''
    return str(raw_root).replace(' ', '').replace('-', '')


def _anki_tag(root, part_of_speech):
    """Build the tag string from a cleaned root and a part-of-speech cell."""
    tag = f"שורש::{root.replace('.','')} " if root else ""
    # A missing cell is read back as NaN, which does not support ``in``.
    if pd.isna(part_of_speech):
        return tag
    for keyword, pos_tag in _PART_OF_SPEECH_TAGS:
        if keyword in part_of_speech:
            return tag + pos_tag
    return tag


def modify_for_anki():
    """Turn ``pealim_dict.csv`` into ``pealim_dict_for_anki.csv``.

    Reads ``pealim_dict.csv`` from the current working directory (first
    column as index, every value as a string) and then:

    * adds a ``shared roots`` column listing the other words that have the
      same ``Root`` value, separated by single spaces;
    * removes spaces and hyphens from the second column (the root) and
      replaces missing roots with an empty string;
    * adds a ``tags`` column made of an optional root tag, followed by a
      part-of-speech tag derived from the third column.

    The result is written with ``;`` as separator and printed.
    """
    df = pd.read_csv('pealim_dict.csv', index_col=0, dtype=str)

    df['shared roots'] = _shared_root_words(df)

    # clean
    # The root and part-of-speech columns are addressed by position, as the
    # input layout is positional: column 1 is the root, column 2 the part
    # of speech.
    cleaned_roots = [_clean_root(raw_root) for raw_root in df.iloc[:, 1]]
    tags = [
        _anki_tag(root, part_of_speech)
        for root, part_of_speech in zip(cleaned_roots, df.iloc[:, 2])
    ]

    df[df.columns[1]] = cleaned_roots
    df['tags'] = tags
    df.to_csv('pealim_dict_for_anki.csv', sep=';', index=True)
    print(df)
|
||
|
||
# Script driver: first scrape the dictionary into pealim_dict.csv, then
# derive pealim_dict_for_anki.csv from that file.  The order is significant
# because the second step reads the file the first step writes.
for pipeline_step in (extract_from_website, modify_for_anki):
    pipeline_step()
|
||
|