In [None]:
# Estimates the number of data series on HDX. Details here: 
# https://humanitarian.atlassian.net/wiki/spaces/HDX/pages/2796126209/Estimating+the+number+of+Data+Series+on+HDX

# Configuration
# Name of the HDX server to target. It selects the './config/<server>-tagbot.yml'
# configuration file and is also embedded in the report file name.
server = 'prod'

# Local account name; only used to build the absolute output path in write_report().
user = "hendrix"  # switches part of the working path depending on which machine is in use

from hdx.hdx_configuration import Configuration
from hdx.location.country import Country
from hdx.data.vocabulary import Vocabulary
from hdx.data.resource import Resource
import pandas as pd
from IPython.display import clear_output
import numpy as np
from hdx.data.dataset import Dataset
from hdx.utilities.easy_logging import setup_logging
import requests
import re
import string
import datetime

# Optional fixed timestamp: when truthy, get_now() returns this value instead of
# formatting the current time. Nothing in this notebook sets it to anything else.
override_now = None

In [None]:
# Build the HDX configuration for the server chosen in `server`
# (config files exist per environment: prod, test, feature, demo).
# Configuration.delete() is called first -- presumably so that re-running this
# cell does not trip over a configuration created earlier (TODO confirm).
Configuration.delete()
Configuration.create(
    preprefix='HDXINTERNAL',
    user_agent='tagbot',
    hdx_config_yaml='./config/' + server + '-tagbot.yml',
)
# Logging via hdx.utilities.easy_logging is currently disabled:
# setup_logging()

In [None]:
# Load the HDX API key from a local file so it is never written into the notebook.
# Surrounding whitespace is stripped: a key file saved by an editor normally ends
# with a newline, which would otherwise become part of the key.
with open('./config/tagbot_key.txt', 'r') as myfile:
    hdx_key = myfile.read().strip()

In [None]:
# Fields of each country record (from the hdx python country library) whose
# text is used as input when compiling the stop-word list.
info_to_use = [
    # codes
    '#country+code+v_iso3',
    '#country+code+v_iso2',
    # names used by different systems
    '#country+name+preferred',
    '#country+alt+name+v_fts',
    '#country+alt+name+v_hrinfo_country',
    '#country+name+short+v_reliefweb',
    '#country+alt+name+v_reliefweb',
    # UNTERM names, one per language
    '#country+alt+i_en+name+v_unterm',
    '#country+alt+i_fr+name+v_unterm',
    '#country+alt+i_es+name+v_unterm',
    '#country+alt+i_ru+name+v_unterm',
    '#country+alt+i_zh+name+v_unterm',
    '#country+alt+i_ar+name+v_unterm',
    # sub-region
    '#region+name+preferred+sub',
]

def compile_stop_words():
    """Build the list of words to strip out of dataset names.

    Walks every country record returned by ``Country.countriesdata()`` and, for
    each field named in ``info_to_use``, adds the lower-cased words of that
    field. A fixed set of generic/compass words is appended at the end.

    Returns:
        list: the collected words with duplicates removed, in first-seen order.

    Raises:
        KeyError: if a country record lacks one of the ``info_to_use`` fields.
    """
    words = []
    all_countries = Country.countriesdata()['countries']
    for key in all_countries:
        record = all_countries[key]
        for field in info_to_use:
            value = record[field]
            # Test for a missing value *before* lower-casing it; calling
            # .lower() first would raise AttributeError on None and make the
            # guard unreachable.
            if value is None:
                continue
            words.extend(clean_and_split(value.lower()))
    words.extend(['people','s','north','south','east','west','northeast','southeast','northwest','southwest','central'])
    # dict.fromkeys de-duplicates while keeping insertion order
    return list(dict.fromkeys(words))

def clean_and_split(s):
    """Remove punctuation from ``s`` and split what is left on whitespace.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted, not replaced by a space, so
    "d'Ivoire" becomes "dIvoire".

    Args:
        s (str): text to tokenise.

    Returns:
        list: the words of ``s`` in order; empty if ``s`` has no word characters.
    """
    # Raw string: in a normal literal "\w" and "\s" are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    without_punctuation = re.sub(r"[^\w\s]", "", s)
    return without_punctuation.split()

def clean_dataset_name(name):
    """Return ``name`` with punctuation and all stop words removed.

    The name is tokenised with ``clean_and_split``; a token is dropped when its
    lower-cased form appears in the module-level ``stop_words`` list. The
    surviving tokens keep their original case and order and are joined with
    single spaces.
    """
    kept = []
    for token in clean_and_split(name):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def write_report(df):
    """Write ``df`` to a CSV file in the local logs folder.

    The destination is assembled from the module-level ``user``, ``server`` and
    ``now`` values, so all three must be defined before this is called. The file
    is written as UTF-8, without the index, with dates formatted as YYYY-MM-DD.

    NOTE(review): the file name says "resource_format_report" and the folder is
    "resource format cleanup", although this notebook builds a data-series
    report -- looks carried over from another notebook; confirm before renaming.
    The absolute Windows path also ties the notebook to one machine layout.
    """
    log_dir = 'C:/Users/'+user+'/Google Drive/Work/OCHA/Projects/DAP/resource format cleanup/logs/'
    file_name = server+'_resource_format_report_'+now+'.csv'
    df.to_csv(log_dir+file_name,
              index=False,
              encoding='utf-8',
              date_format='%Y-%m-%d')
    
def get_now():
    """Return the timestamp string used to label this run.

    If the module-level ``override_now`` is truthy it is returned unchanged;
    otherwise the current local time is formatted as ``YYYY-MM-DD-HH-MM``.
    """
    if not override_now:
        return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    return override_now
    
def print_progress(c,z):
    """Clear the cell output and print an updated progress counter.

    Args:
        c: number of items processed before this call.
        z: total number of items.

    Returns:
        ``c + 1``, to be passed back in on the next call.
    """
    clear_output()
    done = c + 1
    print(done)
    # the percentage is rounded to zero decimals but still printed as a float, e.g. "50.0%"
    percent = round((done / z) * 100, 0)
    print('Progress: ' + str(percent) + '%')
    return done

In [None]:
#####    MAIN    #####
# Timestamp for this run. write_report() reads the module-level `now` when it
# builds the output file name, so this cell has to run before any report is written.
now = get_now() 
print(now)

In [None]:
# Retrieve every dataset from the configured HDX server.
# NOTE(review): presumably a slow, network-bound call -- confirm, and consider caching the result.
datasets = Dataset.get_all_datasets()

In [None]:
# Module-level stop-word list; clean_dataset_name() reads it, so it must be
# built before the main loop runs.
stop_words = compile_stop_words()

In [None]:
# Start from an empty report frame; the main loop below produces one row per dataset.
dataset_report = pd.DataFrame()

In [None]:
# Build one report row per dataset and write the report.
#
# Rows are collected in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Building the frame here also makes the cell
# safe to re-run, because no rows are left over from an earlier execution.
report_columns = ['series_name', 'org', 'dataset', 'dataset_title']
report_rows = []

c = 0
z = len(datasets)
for dataset in datasets:
    # treat '-' and '_' as word separators so the name can be split into words
    dataset_name_stripped = dataset['name'].replace('-',' ').replace('_',' ')

    # process the name to strip out any words that might be part of a country name
    cleaned_name = clean_dataset_name(dataset_name_stripped)

    # a series is identified by the owning organisation plus the cleaned name
    org_name = dataset['organization']['name']
    series_name = org_name+"|"+cleaned_name
    row = {'series_name': series_name,
           'org': org_name,
           'dataset': dataset['name'],
           'dataset_title': dataset['title']}
    report_rows.append(row)

    c = print_progress(c,z)

# Explicit columns keep the header stable even when `datasets` is empty.
dataset_report = pd.DataFrame(report_rows, columns=report_columns)

# Written once, after the loop: the loop does no I/O of its own, and rewriting
# the whole CSV on every iteration made the run quadratic in the dataset count.
write_report(dataset_report)
    
    