# pealim/pealim_extract.py

#!./bin/python3
import requests
import pandas as pd

def _fetch_last_table(url, cookies):
    """Download *url* with *cookies* and return the last HTML table as a DataFrame."""
    # Local import keeps this block self-contained; `io` is standard library.
    import io

    response = requests.get(url, cookies=cookies, timeout=30)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    # Wrap the payload in a file-like object: recent pandas versions deprecate
    # passing literal HTML directly to read_html.
    return pd.read_html(io.BytesIO(response.content))[-1]


def extract_from_website(total_pages=608, output_path='pealim_dict.csv'):
    """Scrape the paginated dictionary at pealim.com/dict/ into a CSV file.

    Every page is requested twice with different cookie sets. The last table
    of the first response supplies the main columns; the 'Word' column of the
    last table of the second response is appended as 'Word Without Nikkud'.

    Args:
        total_pages: Number of dictionary pages to fetch (pages are numbered
            from 1, and page ``total_pages`` itself is included). The site
            lists 15 words per page, so this is number-of-words / 15.
        output_path: Destination CSV file.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        ValueError: If a page contains no HTML table (raised by pandas).
    """
    # NOTE(review): presumably 'mo' selects the vocalised spelling and 'vl' the
    # plain spelling, given how the second column is named -- confirm on site.
    main_cookies = {'translit': 'none', 'hebstyle': 'mo'}
    plain_cookies = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}

    page_frames = []
    # `+ 1` so that the final page is fetched as well; range() excludes its end.
    for page_num in range(1, total_pages + 1):
        url = f"https://www.pealim.com/dict/?page={page_num}"
        main_table = _fetch_last_table(url, main_cookies)
        plain_words = _fetch_last_table(url, plain_cookies)['Word']
        plain_words = plain_words.rename('Word Without Nikkud')
        page_frames.append(pd.concat([main_table, plain_words], axis=1))

    # Concatenate once at the end rather than growing a DataFrame per page,
    # which copies all previously collected rows on every iteration.
    if page_frames:
        df = pd.concat(page_frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    df.to_csv(output_path)

def _shared_root_words(roots, words):
    """Return, for each row, the space-separated other words with the same root."""
    words_by_root = {}
    for root, word in zip(roots, words):
        # '-' marks "no root"; non-string values are missing cells (NaN).
        if isinstance(root, str) and root != '-' and isinstance(word, str):
            words_by_root.setdefault(root, []).append(word)

    shared = []
    for root, word in zip(roots, words):
        siblings = words_by_root.get(root, ()) if isinstance(root, str) else ()
        shared.append(' '.join(w for w in siblings if w != word))
    return shared


def _clean_root(raw_root):
    """Strip spaces and dashes from a root cell; missing cells become ''."""
    if not isinstance(raw_root, str):
        return ''
    return raw_root.replace(' ', '').replace('-', '')


def _anki_tag(cleaned_root, part_of_speech):
    """Build the tag string (root tag followed by a part-of-speech tag)."""
    # Order matters: the first matching keyword wins, as in an if/elif chain.
    pos_tags = (
        ('Adverb', "תוארי_הפועל"),
        ('Pronoun', "כינויי_גוף"),
        ('Noun', "שם_עצם"),
        ('Verb', "פעלים"),
        ('Adjective', "שם_תואר"),
        ('Preposition', "מילות_יחס"),
        ('Conjunction', "מילות_חיבור"),
        ('Particle', "מילית"),
    )
    tag = ""
    if cleaned_root:
        # The trailing space separates the root tag from the following tag.
        tag += f"שורש::{cleaned_root.replace('.', '')} "
    if isinstance(part_of_speech, str):
        for keyword, label in pos_tags:
            if keyword in part_of_speech:
                tag += label
                break
    return tag


def modify_for_anki(input_path='pealim_dict.csv',
                    output_path='pealim_dict_for_anki.csv'):
    """Post-process the scraped dictionary CSV into a ';'-separated file.

    Reads *input_path*, then:
      * adds a 'shared roots' column listing, space-separated and in file
        order, the other words whose 'Root' value is identical (rows whose
        root is '-' or missing get an empty string);
      * rewrites the column at position 1 with spaces and dashes removed;
      * adds a 'tags' column built from that cleaned root and from the
        column at position 2;
      * writes the result to *output_path* and prints the DataFrame.

    Args:
        input_path: CSV to read; its first column is used as the index and
            it must contain 'Root' and 'Word' columns.
        output_path: Destination of the ';'-separated CSV.
    """
    df = pd.read_csv(input_path, index_col=0, dtype=str)

    # Join the sibling words explicitly. Going through str(ndarray) wraps long
    # lists over several lines, elides very long ones with '...', and mangles
    # words that contain quotes or brackets.
    df['shared roots'] = _shared_root_words(df.Root, df.Word)

    # Columns are addressed by position here.
    # NOTE(review): position 1 is presumably 'Root' and position 2 the part
    # of speech -- confirm against the scraped table layout.
    root_column = df.columns[1]
    cleaned_roots = [_clean_root(raw) for raw in df[root_column]]
    df['tags'] = [
        _anki_tag(root, part_of_speech)
        for root, part_of_speech in zip(cleaned_roots, df.iloc[:, 2])
    ]
    df[root_column] = cleaned_roots

    df.to_csv(output_path, sep=';', index=True)
    print(df)

# Run the pipeline only when executed as a script, so that importing this
# module does not trigger a full scrape of the website.
if __name__ == "__main__":
    extract_from_website()
    modify_for_anki()