pealim/pealim_extract.py
2024-06-08 21:15:20 -07:00

72 lines
2.5 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!./bin/python3
import requests
import pandas as pd
def _fetch_tables(url, cookies, timeout):
    """Download *url* with *cookies* and return every HTML table found on it."""
    response = requests.get(url, cookies=cookies, timeout=timeout)
    # Fail loudly on a 4xx/5xx answer instead of handing an error page to
    # the HTML table parser.
    response.raise_for_status()
    return pd.read_html(response.content)


def extract_from_website(total_pages=608, timeout=30):
    """Scrape the pealim.com dictionary listing into ``pealim_dict.csv``.

    Every listing page is requested twice with different cookies. The last
    table of the first response is kept whole; from the last table of the
    second response only the ``Word`` column is kept, renamed to
    ``Word Without Nikkud`` and appended as an extra column.

    Args:
        total_pages: Number of pages in pealim.com/dict/, i.e. number of
            words / 15. Pages ``1..total_pages`` (inclusive) are fetched.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On a network failure, timeout, or a
            non-success HTTP status.
    """
    # NOTE(review): the cookie values are taken verbatim from the original
    # script; 'vl' presumably selects the spelling without nikkud, given how
    # its result is named below - confirm against the site.
    cookies_with_nikkud = {'translit': 'none', 'hebstyle': 'mo'}
    cookies_without_nikkud = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}

    page_frames = []
    # Inclusive upper bound: range(1, total_pages) would skip the last page.
    for page_num in range(1, total_pages + 1):
        url = f"https://www.pealim.com/dict/?page={page_num}"
        page_table = _fetch_tables(url, cookies_with_nikkud, timeout)[-1]
        without_nikkud_words = _fetch_tables(url, cookies_without_nikkud, timeout)[-1]['Word']
        without_nikkud_words = without_nikkud_words.rename('Word Without Nikkud')
        page_frames.append(pd.concat([page_table, without_nikkud_words], axis=1))

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously collected rows on every page.
    if page_frames:
        df = pd.concat(page_frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    df.to_csv('pealim_dict.csv')
# Prefix of the per-root tag; the cleaned root (without dots) is appended.
_ROOT_TAG_PREFIX = "שורש::"

# (substring of the part-of-speech cell, tag) pairs. They are checked in this
# order and the first match wins.
# NOTE(review): the 'Pronoun' tag looks like it may contain a typo; it is kept
# unchanged because existing decks may already use it - confirm.
_PART_OF_SPEECH_TAGS = (
    ('Adverb', "תוארי_הפועל"),
    ('Pronoun', "כינוייוף"),
    ('Noun', "שם_עצם"),
    ('Verb', "פעלים"),
    ('Adjective', "שם_תואר"),
    ('Preposition', "מילות_יחס"),
    ('Conjunction', "מילות_חיבור"),
    ('Particle', "מילית"),
)


def _part_of_speech_tag(part_of_speech):
    """Return the tag for a part-of-speech cell, or '' when nothing matches."""
    # str() keeps a missing (NaN) cell from raising on the ``in`` test.
    text = str(part_of_speech)
    for marker, tag in _PART_OF_SPEECH_TAGS:
        if marker in text:
            return tag
    return ""


def modify_for_anki():
    """Turn ``pealim_dict.csv`` into ``pealim_dict_for_anki.csv``.

    Reads ``pealim_dict.csv`` (first column as index, every cell as text) and:

    * adds a ``shared roots`` column holding the space-separated words of all
      other rows that have the same ``Root`` value and a different ``Word``
      (empty when the root is ``'-'`` or missing);
    * strips spaces and dashes from the second data column (position 1) and
      blanks it when missing;
    * adds a ``tags`` column made of an optional root tag followed by a
      part-of-speech tag derived from the third data column (position 2).

    The result is written with ``;`` as separator and also printed.
    """
    df = pd.read_csv('pealim_dict.csv', index_col=0, dtype=str)

    # Group the words by root in one pass instead of filtering the whole
    # frame once per row.
    words_by_root = {}
    for root, word in zip(df.Root, df.Word):
        if pd.isna(root) or root == '-':
            continue
        words_by_root.setdefault(root, []).append(word)

    shared_root_words = []
    for root, word in zip(df.Root, df.Word):
        if pd.isna(root) or root == '-':
            shared_root_words.append('')
            continue
        siblings = [
            other for other in words_by_root[root]
            if not pd.isna(other) and other != word
        ]
        # Join explicitly: str() of a NumPy array wraps long output across
        # lines, which put newlines into the CSV field.
        shared_root_words.append(' '.join(siblings))
    df['shared roots'] = shared_root_words

    # clean
    # Columns are addressed by position, as in the original script.
    # NOTE(review): position 1 is presumably 'Root' and position 2 the part
    # of speech - confirm against the scraped table layout.
    tags = []
    cleaned_roots = []
    for raw_root, part_of_speech in zip(df.iloc[:, 1], df.iloc[:, 2]):
        if pd.isna(raw_root):
            root = ''
        else:
            root = str(raw_root).replace(' ', '').replace('-', '')
        tag = ""
        if root:
            # Dots are dropped in the tag only; the stored root keeps them.
            tag += _ROOT_TAG_PREFIX + root.replace('.', '') + " "
        tag += _part_of_speech_tag(part_of_speech)
        tags.append(tag)
        cleaned_roots.append(root)
    df.iloc[:, 1] = cleaned_roots
    df['tags'] = tags

    df.to_csv('pealim_dict_for_anki.csv', sep=';', index=True)
    print(df)
# Script entry: run the two stages in order. The second stage reads the
# 'pealim_dict.csv' file that the first stage writes, so the order matters.
for _stage in (extract_from_website, modify_for_anki):
    _stage()