#!./bin/python3
"""Scrape the pealim.com Hebrew dictionary into a CSV, then post-process it
into a semicolon-separated file suitable for import into Anki."""

import pandas as pd


def extract_from_website():
    """Download every page of the pealim.com dictionary and save the merged
    table to 'pealim_dict.csv' in the current directory.

    Each page is fetched twice: once styled with nikkud (vowel points) and
    once without, and the nikkud-free word column is joined alongside the
    full table.

    Side effects: HTTP requests to pealim.com; writes 'pealim_dict.csv'.
    """
    # Deferred import: only this function needs the network dependency, so
    # modify_for_anki() stays usable in environments without `requests`.
    import requests

    # Number of total pages of the dictionary at pealim.com/dict/,
    # i.e. number of words / 15 words per page.
    total_pages = 608

    df = pd.DataFrame()
    # BUG FIX: the original `range(1, total_pages)` silently dropped the
    # final page; pages are numbered 1..total_pages inclusive.
    for page_num in range(1, total_pages + 1):
        url = f"https://www.pealim.com/dict/?page={page_num}"

        # First fetch: Hebrew rendered with nikkud ('mo' style), no
        # transliteration.
        cookies = {'translit': 'none', 'hebstyle': 'mo'}
        html = requests.get(url, cookies=cookies).content
        df_list = pd.read_html(html)

        # Second fetch: same page without nikkud ('vl' style), meanings off.
        cookies = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}
        html = requests.get(url, cookies=cookies).content
        without_nikkud_words = pd.read_html(html)[-1]['Word']
        without_nikkud_words = without_nikkud_words.rename('Word Without Nikkud')

        # The dictionary table is the last table on the page.
        df_to_add = pd.concat([df_list[-1], without_nikkud_words], axis=1)
        df = pd.concat([df, df_to_add], ignore_index=True)

    df.to_csv('pealim_dict.csv')


def modify_for_anki():
    """Read 'pealim_dict.csv' and write 'pealim_dict_for_anki.csv' with two
    added columns:

    * 'shared roots' -- space-separated list of the other words built on the
      same root (empty when the root is the '-' placeholder);
    * 'tags'         -- Anki tags: 'שורש::<root letters>' plus a Hebrew
      part-of-speech tag.

    The 'Root' column is also rewritten in its cleaned, separator-free form.
    Side effects: reads and writes CSV files in the current directory.
    """
    df = pd.read_csv('pealim_dict.csv', index_col=0, dtype=str)

    # For every word with a real root, collect the sibling words that share it.
    shared_root_words = []
    for i in range(df.shape[0]):
        root = df.Root.iloc[i]
        word = df.Word.iloc[i]
        if root != '-':
            # BUG FIX: one combined mask replaces the original chained
            # indexing df[m1][m2], which applied a full-length boolean mask to
            # an already-filtered frame (pandas UserWarning; result depends on
            # index alignment rather than position).
            siblings = df[(df.Root == root) & (df.Word != word)].Word.values
            shared_root_words.append(
                str(siblings).replace('[', '').replace(']', '').replace('\'', ''))
        else:
            shared_root_words.append('')
    df['shared roots'] = shared_root_words

    # Build the Anki tag string for every row and clean the Root column.
    tags = []
    for i in range(df.shape[0]):
        tag = ""
        # Column 1 is 'Root': strip the ' - ' separators, e.g. 'ש - ל - ם' -> 'שלם'.
        root = str(df.iat[i, 1]).replace(' ', '').replace('-', '')
        if 'nan' in root or root == '':
            # Missing root (NaN stringifies to 'nan') or placeholder: no root tag.
            root = ''
        else:
            # Dots (multi-letter radicals) are dropped from the tag but kept
            # in the stored Root value.
            tag += f"שורש::{root.replace('.','')} "
        # Column 2 is the part of speech; substring checks, with the more
        # specific 'Pronoun' deliberately tested before 'Noun'.
        part_of_speech = df.iat[i, 2]
        if 'Adverb' in part_of_speech:
            tag += "תוארי_הפועל"
        elif 'Pronoun' in part_of_speech:
            tag += "כינויי_גוף"
        elif 'Noun' in part_of_speech:
            tag += "שם_עצם"
        elif 'Verb' in part_of_speech:
            tag += "פעלים"
        elif 'Adjective' in part_of_speech:
            tag += "שם_תואר"
        elif 'Preposition' in part_of_speech:
            tag += "מילות_יחס"
        elif 'Conjunction' in part_of_speech:
            tag += "מילות_חיבור"
        elif 'Particle' in part_of_speech:
            tag += "מילית"
        tags.append(tag)
        # Overwrite the Root column with the cleaned form for the output CSV.
        df.iat[i, 1] = root
    df['tags'] = tags

    df.to_csv('pealim_dict_for_anki.csv', sep=';', index=True)
    # NOTE(review): the mangled source placed a bare `print(df)` after the
    # final to_csv; a module-level print would NameError, so it is kept here
    # as the original debug dump — confirm against the intended layout.
    print(df)


if __name__ == "__main__":
    # Guarding the driver calls keeps the module importable without
    # triggering a full site scrape; running the script is unchanged.
    extract_from_website()
    modify_for_anki()