Update machine learning scripts with NNMF and TextRank-GloVe techniques

This commit is contained in:
cronopioelectronico
2021-09-07 18:18:47 +02:00
parent df623f39b9
commit 6d6888f201
4 changed files with 2445 additions and 119 deletions

View File

@@ -5,11 +5,63 @@
""" """
Related Participatory Budgeting projects and Tags - Dummy script Related Participatory Budgeting projects and Tags
This script generates for each project: a) Tags, b) List of related projects.
Running time: Max 2 hours for 10,000 projects.
Technique used: NNMF and Euclidean distance between projects.
More info in: https://github.com/consul-ml/consul-ml
""" """
# In[ ]:
def check_file(file_name):
if os.path.isfile(file_name):
return
else:
try:
logging.info('Missing file in Related Participatory Budgeting projects and Tags: ' + str(file_name))
except NameError:
print('No logging')
with open(os.path.join(data_path,taggings_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,tags_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,related_props_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[ ]:
# Input file:
inputjsonfile = 'budget_investments.json'
col_id = 'id'
col_title = 'title'
cols_content = ['title','description']
# Output files:
topics_tags_filename = 'ml_topics_tags_budgets.json'
topics_tags_filename_csv = 'ml_topics_tags_budgets.csv'
repr_prop_filename = 'ml_repr_budgets.json'
repr_prop_filename_csv = 'ml_repr_budgets.csv'
taggings_filename = 'ml_taggings_budgets.json'
taggings_filename_csv = 'ml_taggings_budgets.csv'
tags_filename = 'ml_tags_budgets.json'
tags_filename_csv = 'ml_tags_budgets.csv'
related_props_filename = 'ml_related_content_budgets.json'
related_props_filename_csv = 'ml_related_content_budgets.csv'
tqdm_notebook = True
# In[2]:
@@ -17,70 +69,691 @@ data_path = '../data'
config_file = 'budgets_related_content_and_tags_nmf.ini'
logging_file ='budgets_related_content_and_tags_nmf.log'
# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))
stanza_model_lang = config['PREPROCESSING']['stanza_model_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation')
n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count')
stanza_download = config['PREPROCESSING'].getboolean('stanza_download')
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals')
numb_topics = config['TOPIC_MODELLING'].getint('numb_topics')
numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic')
n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props')
n_features = config['TOPIC_MODELLING'].getint('n_features')
min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val')
max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val')
logging_level = config['LOGGING']['logging_level']
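# The script assumes an .ini file providing exactly the sections and keys read above.
# A hypothetical example layout (the values shown are illustrative, not project defaults):
#
#   [PREPROCESSING]
#   stanza_model_lang = es
#   stopwords_lang = spanish
#   noun_lemmatisation = True
#   n_gram_min_count = 5
#   stanza_download = False
#   nltk_download = False
#   [RELATED_PROPOSALS]
#   numb_related_proposals = 3
#   [TOPIC_MODELLING]
#   numb_topics = 20
#   numb_topkeywords_pertopic = 5
#   n_top_represent_props = 3
#   n_features = 1000
#   min_df_val = 0.001
#   max_df_val = 0.5
#   [LOGGING]
#   logging_level = INFO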
# In[3]:
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
repr_prop_cols = ['topic_id','proposal_id','title']
tags_file_cols = ['id','name','taggings_count','kind']
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
tags_file_cols_count = 'taggings_count'
taggings_file_cols_id = 'tag_id'
# In[4]:
import logging
logging.basicConfig(filename=os.path.join(data_path,logging_file),
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging_level)
#logging.info('message')
# In[5]:
import os
import re
import numpy as np
import pandas as pd
from unicodedata import normalize
import sys
# In[6]:
import stanza
if stanza_download:
    stanza.download(stanza_model_lang)
# IF NEEDED define 'pos_batch_size': 10000 in the next cell, config options.
config = {
    'processors': 'tokenize,mwt,pos,lemma',
    'lang': stanza_model_lang
}
#not using depparse
nlp = stanza.Pipeline(**config)
# In[7]:
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[8]:
import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# In[9]:
import gensim
from gensim.models.phrases import Phrases, Phraser
# In[10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# In[ ]:
# In[ ]:
# # Read the proposals and join the content to use in the topic modelling
# In[ ]:
check_file(os.path.join(data_path,inputjsonfile))
proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
proposals_input_df = proposals_input_df[[col_id]+cols_content]
# In[ ]:
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(proposals_input_df) == 0:
logging.info('No Proposals found.')
with open(os.path.join(data_path,taggings_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,tags_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,related_props_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[11]:
# Normalise characters
for col in cols_content:
proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x))
proposals_input_df['joined_content'] = proposals_input_df[cols_content].agg('\n'.join, axis=1)
proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title}))
# In[ ]:
# # Lemmatise the content
# In[12]:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content']
# In[13]:
# Using Stanza from Stanford NLP group
def content_processing_for_topicmodelling_1(txt):
# Delete html tags and urls
tmp_txt = re.sub("<[^<]+?>","",txt)
tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt)
tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt)
tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt)
tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt)
# Tokenise, lemmatise and select only the nouns
new_txt_tok = []
if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0:
tmp_txt_nlp = nlp(tmp_txt)
for sent in tmp_txt_nlp.sentences:
for token in sent.words:
if noun_lemmatisation:
if token.upos == 'NOUN':
new_txt_tok.append(token.lemma)
else:
new_txt_tok.append(token.text)
return new_txt_tok
# In[14]:
if tqdm_notebook:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1)
else:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1)
# In[ ]:
#
# # Clean the data
# In[15]:
# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))
if stopwords_lang == 'spanish':
for word in stop_words:
stop_words = stop_words.union({re.sub(r"á","a",word)})
stop_words = stop_words.union({re.sub(r"é","e",word)})
stop_words = stop_words.union({re.sub(r"í","i",word)})
stop_words = stop_words.union({re.sub(r"ó","o",word)})
stop_words = stop_words.union({re.sub(r"ú","u",word)})
# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[16]:
def content_processing_for_topicmodelling_2(txt_tok):
new_text_tok = []
for word in txt_tok:
new_word = word.lower()
new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word)
new_word = re.sub(r"[0-9]+","",new_word)
new_word = new_word.rstrip("\n")
if (len(new_word) != 0) and (new_word not in all_stop_words):
new_text_tok.append(new_word)
return new_text_tok
# In[17]:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2)
# In[ ]:
# # Detect n-grams
# In[18]:
txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist()
phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count)
txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram]
txt_bigram_joined = [' '.join(txt) for txt in txt_bigram]
# may also contain quadrigrams (4-grams) when two bigrams are joined:
# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count)
# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram]
# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram]
proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined
# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined
# In[ ]:
# In[ ]:
# # Topic modelling (NMF)
# In[19]:
df_col_to_use = proposals_input_df['joined_content_topicmodelling']
# NUMBER OF TOPICS
n_components = numb_topics
# SELECT the TOP n_top_words WORDS for each topic
n_top_words = numb_topkeywords_pertopic
# Use tf-idf features for NMF
tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val,
max_features=n_features)
tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist())
# In[20]:
# Includes some extra steps for Spanish
def cleaning_features(top_features):
clean_features = top_features.copy()
for feature in clean_features:
if feature+'s' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'s'))] = ''
if stopwords_lang == 'spanish':
for feature in clean_features:
if feature+'es' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'es'))] = ''
for feature in clean_features:
if feature+'r' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'r'))] = ''
nosign_features = clean_features.copy()
if stopwords_lang == 'spanish':
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"á","a",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"é","e",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"í","i",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"ó","o",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"ú","u",fet)
for pos,fet in enumerate(nosign_features):
if fet in nosign_features[pos+1:]:
clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = ''
return clean_features
# Fit the NMF model
nmf = NMF(n_components=n_components, random_state=1,
alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# nmf.components_ is the H matrix
# W = nmf.fit_transform(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Size of the vocabulary and the nmf matrix
#print(len(tfidf_vectorizer.vocabulary_))
#print(len(tfidf_feature_names))
#nmf.components_.shape
# In[ ]:
# ### Create file: Repr_Prop. Most representative proposal for each topic
# In[21]:
W = nmf.fit_transform(tfidf)
#print(W.shape)
repr_prop_df = pd.DataFrame(columns=repr_prop_cols)
for topic_index in range(n_components):
top_indices = np.argsort( W[:,topic_index] )[::-1]
top_represent_proposals = []
for proposal_index in top_indices[0:n_top_represent_props]:
top_represent_proposals.append(proposal_index)
for prop_internal_index in top_represent_proposals:
row = [topic_index,
proposals_input_df.loc[int(prop_internal_index),'id'],
proposals_input_df.loc[int(prop_internal_index),'title']]
repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True)
# In[22]:
repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False)
repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False)
# In[ ]:
# ### Create file: Topics_Tags. List of Topics with their top Tags
# In[23]:
topics_tags_df = pd.DataFrame(columns=['id']+tag_cols)
# nmf.components_ is the H matrix
for topic_idx, topic in enumerate(nmf.components_):
obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
clean_obj_temp = cleaning_features(obj_temp)
clean_obj_temp.insert(0, str(topic_idx))
#print(clean_obj_temp)
topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True)
# In[24]:
topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False)
topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False)
# In[ ]:
# ### Create file: Taggings. Each line is a Tag associated to a Proposal
# In[25]:
# Coefficients for the following calculation
tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols)
# nmf.components_ is the H matrix
for topic_idx, topic in enumerate(nmf.components_):
topics_tags_coefs_temp = []
topics_tags_coefs_temp.append(int(topic_idx))
for i in topic.argsort()[:-n_top_words - 1:-1]:
topics_tags_coefs_temp.append(topic[i])
topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols,
topics_tags_coefs_temp)), ignore_index=True)
for col in tag_cols:
for topic_idx,topic in enumerate(topics_tags_df[col].tolist()):
if topic == '':
topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0
# In[26]:
topics_tags_flat = []
for idx,topic in topics_tags_df.iterrows():
topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist()
topics_tags_coefs_flat = []
for idx,topic in topics_tags_coefs_df.iterrows():
topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist()
# In[27]:
taggings_file_df = pd.DataFrame(columns=taggings_file_cols)
for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)):
proposal_topics_temp = np.zeros((len(topics_tags_flat)))
cont = 0
for weight in prop:
for n in range(n_top_words):
proposal_topics_temp[cont] = weight
cont += 1
proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat
# Add up the coefficients of duplicate tags:
for numterm_a,term_a in enumerate(topics_tags_flat):
for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))):
term_b = topics_tags_flat[numterm_b]
if (term_a == term_b):
proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b]
proposal_tags_temp[numterm_b] = 0
for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]:
row = [i,proposals_input_df.loc[prop_idx,'id'],'Budget::Investment']
taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
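# A toy worked example of the weighting above (all numbers are made up): with 2 topics and
# 2 top keywords per topic, a proposal with topic weights W_row = [0.8, 0.1],
# topics_tags_flat = ['park', 'bike', 'bike', 'school'] and coefficients
# topics_tags_coefs_flat = [0.9, 0.5, 0.7, 0.4] gets
#   np.repeat(W_row, 2) * [0.9, 0.5, 0.7, 0.4] = [0.72, 0.40, 0.07, 0.04];
# after adding the duplicated 'bike' entries together this becomes [0.72, 0.47, 0.0, 0.04],
# so the proposal's strongest tags are 'park' and 'bike'.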
# ### Create file: Tags. List of Tags with the number of times they have been used
# In[28]:
tags_file_df = pd.DataFrame(columns=tags_file_cols)
for tag_id,tag in enumerate(topics_tags_flat):
row = [tag_id,tag,0,'']
tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)
for tag_id in taggings_file_df[taggings_file_cols_id].tolist():
tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1
# ### Deleting duplicate tags from the Tags and Taggings files before saving them
# In[29]:
change_rows = []
repeated_ids = []
for idx1,row1 in tags_file_df.iterrows():
for idx2,row2 in tags_file_df.iterrows():
if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']):
change_rows.append((idx1,idx2))
repeated_ids.append(idx2)
tags_file_df = tags_file_df.drop(repeated_ids)
# In[30]:
for c_row in change_rows:
taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x)
# In[31]:
tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False)
# In[32]:
taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False)
# In[ ]:
# In[33]:
# proposals_input_df
# In[34]:
# repr_prop_df
# In[35]:
# topics_tags_df
# In[36]:
# taggings_file_df
# In[37]:
# tags_file_df
# In[ ]:
# # LIST OF RELATED PROPOSALS
# In[38]:
proposal_topics_coefs_cols = ['id','topic_coefs']
proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols)
for prop_idx,prop in enumerate(W):
row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()]
proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)),
ignore_index=True)
# In[39]:
related_props_df = pd.DataFrame(columns=related_props_cols)
for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)):
prop_related_temp = []
prop_related_temp.append(int(row['id']))
vectora = row['topic_coefs']
distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()]
# the nearest neighbours include the initial proposal itself (distance 0), hence numb_related_proposals+1
for i in np.array(distances).argsort()[0:numb_related_proposals+1]:
if distances[i] != 0.0:
prop_related_temp.append(int(proposals_input_df.loc[i,'id']))
# in case there are fewer related proposals than the max number
while len(prop_related_temp) < numb_related_proposals+1:
prop_related_temp.append('')
related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True)
# In[40]:
related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False)
# In[ ]:
# In[41]:
#proposal_topics_coefs_df
# In[42]:
#related_props_df
# In[43]:
logging.info('Script executed correctly.')
# In[ ]:

View File

@@ -1,59 +1,549 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
"""
Participatory Budgeting comments summaries
This script generates for each budget project a summary of all its comments.
Running time: Max 1 hour for 10,000 proposals.
Technique used: GloVe embeddings and TextRank.
More info in: https://github.com/consul-ml/consul-ml
""" """
# In[ ]:
# DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER:
# ENGLISH:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip
# SPANISH:
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
#!gunzip glove-sbwc*.gz
# In[ ]:
def check_file(file_name):
if os.path.isfile(file_name):
return
else:
try:
logging.info('Missing file in Participatory Budgeting comments summaries: ' + str(file_name))
except NameError:
print('No logging')
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[ ]:
# Input file:
inputjsonfile = 'comments.json'
col_id = 'commentable_id'
col_content = 'body'
# Output files:
comments_summaries_filename = 'ml_comments_summaries_budgets.json'
comments_summaries_filename_csv = 'ml_comments_summaries_budgets.csv'
tqdm_notebook = True
# In[ ]:
data_path = '../data'
config_file = 'budgets_summary_comments_textrank.ini'
logging_file ='budgets_summary_comments_textrank.log'
# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))
sent_token_lang = config['PREPROCESSING']['sent_token_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
if stopwords_lang == 'spanish':
glove_file = config['SUMMARISATION']['glove_file_es']
if stopwords_lang == 'english':
glove_file = config['SUMMARISATION']['glove_file_en']
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')
logging_level = config['LOGGING']['logging_level']
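# The script assumes an .ini file providing exactly the sections and keys read above.
# A hypothetical example layout (the values shown are illustrative, not project defaults):
#
#   [PREPROCESSING]
#   sent_token_lang = spanish
#   stopwords_lang = spanish
#   nltk_download = False
#   [SUMMARISATION]
#   glove_file_es = glove-sbwc.i25.vec
#   glove_file_en = glove.6B.300d.txt
#   threshold_factor = 1.2
#   max_size_of_summaries = 150
#   [LOGGING]
#   logging_level = INFO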
# In[ ]:
import logging
logging.basicConfig(filename=os.path.join(data_path,logging_file),
filemode='w',
format='%(asctime)s - %(levelname)s - %(message)s',
level=logging_level)
#logging.info('message')
# In[ ]:
import os
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import sys
# In[ ]:
import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')
# In[ ]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# In[ ]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
# In[ ]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections
# In[ ]:
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[ ]:
# Different code for Spanish and English vectors
# Extract word vectors
check_file(os.path.join(data_path,glove_file))
if stopwords_lang == 'english':
non_keyed_embs = os.path.join(data_path,glove_file)
keyed_embs = os.path.join(data_path,glove_file+'.vec')
if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
glove2word2vec(non_keyed_embs, keyed_embs)
glove_file = glove_file+'.vec'
word_embeddings = {}
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
word_embeddings[word] = coefs
f.close()
# In[ ]:
# # Read the comments and join the comments belonging to the same proposal
# In[ ]:
check_file(os.path.join(data_path,inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Budget::Investment']
# In[ ]:
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(comments_input_df) == 0:
logging.info('No Participatory Budgeting comments found to summarise.')
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[ ]:
comments_input_df = comments_input_df[[col_id]+[col_content]]
# Normalise characters
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))
comments_input_df = comments_input_df.sort_values(by=col_id)
comments_input_df.reset_index(drop=True,inplace=True)
# In[ ]:
# Drop empty texts
empty_txt_ids = []
for idx,row in comments_input_df.iterrows():
if row['body'].strip() == '':
empty_txt_ids.append(idx)
comments_input_df = comments_input_df.drop(empty_txt_ids)
comments_input_df.reset_index(drop=True,inplace=True)
# In[ ]:
comments_df = pd.DataFrame()
temp_comments_joined = []
temp_comments_number = []
temp_proposal_id = []
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
temp_comments_joined.append('\n'.join(temp_list))
temp_comments_number.append(len(temp_list))
temp_proposal_id.append(prop_id)
comments_df['prop_id'] = temp_proposal_id
comments_df['comments_joined'] = temp_comments_joined
comments_df['comments_number'] = temp_comments_number
# In[ ]:
# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))
# In[ ]:
# # Make comments lowercase
# In[ ]:
comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())
# In[ ]:
# # Split sentences
# In[ ]:
def split_sentences(txt):
new_text_1 = sent_tokenize(txt,sent_token_lang)
# outputs [] if txt is '' or consists only of ' ' or '\n'
new_text_2 = []
if new_text_1 != []:
for tok1 in new_text_1:
new_text_2 += tok1.split('\n')
#outputs [''] if txt is ''
new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']
if new_text_2 == []:
new_text_2 = ['']
return new_text_2
# In[ ]:
comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)
# In[ ]:
# # Calculate sentence embeddings
# In[ ]:
# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))
if stopwords_lang == 'spanish':
for word in stop_words:
stop_words = stop_words.union({re.sub(r"á","a",word)})
stop_words = stop_words.union({re.sub(r"é","e",word)})
stop_words = stop_words.union({re.sub(r"í","i",word)})
stop_words = stop_words.union({re.sub(r"ó","o",word)})
stop_words = stop_words.union({re.sub(r"ú","u",word)})
# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[ ]:
def sentences_embeddings(sents):
sent_embs = []
for sent in sents:
words = set(word_tokenize(sent))
words = words-all_stop_words
if len(words) != 0:
emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
len(words)+0.001)
else:
emb = np.zeros(300)
sent_embs.append(emb)
return sent_embs
# In[ ]:
if tqdm_notebook:
comments_df['comments_sentences_embeddings'] = comments_df[
'comments_sentences'].progress_apply(sentences_embeddings)
else:
comments_df['comments_sentences_embeddings'] = comments_df[
'comments_sentences'].apply(sentences_embeddings)
# In[ ]:
# # Calculate sentence scores
# In[ ]:
def sentences_scores(sents, sent_embs):
# similarity matrix
if len(sent_embs) > 1:
stacked_sent_embs = np.stack(sent_embs)
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
np.fill_diagonal(sim_mat, 0)
elif len(sent_embs) == 1:
sim_mat = np.array([[0.]])
else:
return collections.OrderedDict([('',1.0)])
nx_graph = nx.from_numpy_array(sim_mat)
try:
sentence_weight_temp = nx.pagerank(nx_graph)
except:
sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)
sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}
sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
sentence_scores = collections.OrderedDict(sorted_sentence_weights)
return sentence_scores
# In[ ]:
def plot_sentences_network(sents, sent_embs):
import matplotlib.pyplot as plt
# similarity matrix
if len(sent_embs) > 1:
stacked_sent_embs = np.stack(sent_embs)
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
np.fill_diagonal(sim_mat, 0)
elif len(sent_embs) == 1:
sim_mat = np.array([[0.]])
else:
print('Nothing to plot')
return
nx_graph = nx.from_numpy_array(sim_mat)
plt.plot()
nx.draw(nx_graph, with_labels=True)
# In[ ]:
comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1)
# In[ ]:
# # Generate the summaries
# In[ ]:
def comments_summary(sentence_weight, threshold_factor, *totalwords):
threshold = threshold_factor * np.mean(list(sentence_weight.values()))
sentence_counter = 0
comments_summary = ''
summary_num_words = 0
for sentence in sentence_weight:
if sentence_weight[sentence] >= (threshold):
if len(totalwords) == 0:
comments_summary += "\n- " + sentence
sentence_counter += 1
elif summary_num_words < totalwords[0]:
comments_summary += "\n- " + sentence
sentence_counter += 1
summary_num_words += len(sentence.split())
comments_summary = comments_summary.lstrip()
return comments_summary
# In[ ]:
comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
lambda x: comments_summary(x,threshold_factor,max_size_of_summaries))
# In[ ]:
# comments_df
# In[ ]:
# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
# print(row['body'])
# print('-------')
# In[ ]:
#print(comments_df.loc[8,'comments_summary'])
# In[ ]:
# In[ ]:
comments_df['commentable_type'] = ['Budget::Investment']*len(comments_df)
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
comments_summaries_df.reset_index(level=0, inplace=True)
# In[ ]:
comments_summaries_df = comments_summaries_df.rename(
columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})
# In[ ]:
#comments_summaries_df
# In[ ]:
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)
# In[ ]:
# In[ ]:
# In[ ]:
logging.info('Script executed correctly.')

View File

@@ -5,11 +5,63 @@
""" """
Related Proposals and Tags - Dummy script Related Proposals and Tags
This script generates for each proposal: a) Tags, b) List of related proposals.
Running time: Max 2 hours for 10,000 proposals.
Technique used: NNMF and Euclidean distance between proposals.
More info in: https://github.com/consul-ml/consul-ml
""" """
# In[ ]:
def check_file(file_name):
if os.path.isfile(file_name):
return
else:
try:
logging.info('Missing file in Related Proposals and Tags: ' + str(file_name))
except NameError:
print('No logging')
with open(os.path.join(data_path,taggings_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,tags_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,related_props_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[ ]:
# Input file:
inputjsonfile = 'proposals.json'
col_id = 'id'
col_title = 'title'
cols_content = ['title','description','summary']
# Output files:
topics_tags_filename = 'ml_topics_tags_proposals.json'
topics_tags_filename_csv = 'ml_topics_tags_proposals.csv'
repr_prop_filename = 'ml_repr_proposals.json'
repr_prop_filename_csv = 'ml_repr_proposals.csv'
taggings_filename = 'ml_taggings_proposals.json'
taggings_filename_csv = 'ml_taggings_proposals.csv'
tags_filename = 'ml_tags_proposals.json'
tags_filename_csv = 'ml_tags_proposals.csv'
related_props_filename = 'ml_related_content_proposals.json'
related_props_filename_csv = 'ml_related_content_proposals.csv'
tqdm_notebook = True
# In[2]:
@@ -17,70 +69,691 @@ data_path = '../data'
config_file = 'proposals_related_content_and_tags_nmf.ini'
logging_file ='proposals_related_content_and_tags_nmf.log'
# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))
stanza_model_lang = config['PREPROCESSING']['stanza_model_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation')
n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count')
stanza_download = config['PREPROCESSING'].getboolean('stanza_download')
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals')
numb_topics = config['TOPIC_MODELLING'].getint('numb_topics')
numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic')
n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props')
n_features = config['TOPIC_MODELLING'].getint('n_features')
min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val')
max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val')
logging_level = config['LOGGING']['logging_level']
# In[3]:
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
repr_prop_cols = ['topic_id','proposal_id','title']
tags_file_cols = ['id','name','taggings_count','kind']
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
tags_file_cols_count = 'taggings_count'
taggings_file_cols_id = 'tag_id'
# In[4]:
import logging
logging.basicConfig(filename=os.path.join(data_path,logging_file),
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging_level)
#logging.info('message')
# In[5]:
import os
import re
import numpy as np
import pandas as pd
from unicodedata import normalize
import sys
# In[6]:
import stanza
if stanza_download:
    stanza.download(stanza_model_lang)
# IF NEEDED define 'pos_batch_size': 10000 in the next cell, config options.
config = {
    'processors': 'tokenize,mwt,pos,lemma',
    'lang': stanza_model_lang
}
#not using depparse
nlp = stanza.Pipeline(**config)
# In[7]:
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[8]:
import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# In[9]:
import gensim
from gensim.models.phrases import Phrases, Phraser
# In[10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# In[ ]:
# In[ ]:
# # Read the proposals and join the content to use in the topic modelling
# In[ ]:
check_file(os.path.join(data_path,inputjsonfile))
proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
proposals_input_df = proposals_input_df[[col_id]+cols_content]
# In[ ]:
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(proposals_input_df) == 0:
logging.info('No Proposals found.')
with open(os.path.join(data_path,taggings_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,tags_filename), 'w') as file:
file.write('[]')
with open(os.path.join(data_path,related_props_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[11]:
# Normalise characters
for col in cols_content:
proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x))
proposals_input_df['joined_content'] = proposals_input_df[cols_content].agg('\n'.join, axis=1)
proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title}))
# In[ ]:
# # Lemmatise the content
# In[12]:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content']
# In[13]:
# Using Stanza from Stanford NLP group
def content_processing_for_topicmodelling_1(txt):
# Delete html tags and urls
tmp_txt = re.sub("<[^<]+?>","",txt)
tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt)
tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt)
tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt)
tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt)
# Tokenise, lemmatise and select only the nouns
new_txt_tok = []
if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0:
tmp_txt_nlp = nlp(tmp_txt)
for sent in tmp_txt_nlp.sentences:
for token in sent.words:
if noun_lemmatisation:
if token.upos == 'NOUN':
new_txt_tok.append(token.lemma)
else:
new_txt_tok.append(token.text)
return new_txt_tok
# In[14]:
if tqdm_notebook:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1)
else:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1)
# In[ ]:
#
# # Clean the data
# In[16]:
# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))
if stopwords_lang == 'spanish':
for word in stop_words:
stop_words = stop_words.union({re.sub(r"á","a",word)})
stop_words = stop_words.union({re.sub(r"é","e",word)})
stop_words = stop_words.union({re.sub(r"í","i",word)})
stop_words = stop_words.union({re.sub(r"ó","o",word)})
stop_words = stop_words.union({re.sub(r"ú","u",word)})
# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[17]:
def content_processing_for_topicmodelling_2(txt_tok):
new_text_tok = []
for word in txt_tok:
new_word = word.lower()
new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word)
new_word = re.sub(r"[0-9]+","",new_word)
new_word = new_word.rstrip("\n")
if (len(new_word) != 0) and (new_word not in all_stop_words):
new_text_tok.append(new_word)
return new_text_tok
# In[18]:
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2)
# In[ ]:
# # Detect n-grams
# In[19]:
txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist()
phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count)
txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram]
txt_bigram_joined = [' '.join(txt) for txt in txt_bigram]
# may also contain quadrigrams (4-grams) when two bigrams are joined:
# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count)
# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram]
# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram]
proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined
# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined
# In[ ]:
# In[ ]:
# # Topic modelling (NMF)
# In[20]:
df_col_to_use = proposals_input_df['joined_content_topicmodelling']
# NUMBER OF TOPICS
n_components = numb_topics
# SELECT the TOP n_top_words WORDS for each topic
n_top_words = numb_topkeywords_pertopic
# Use tf-idf features for NMF
tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val,
max_features=n_features)
tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist())
# In[21]:
# Includes some extra steps for Spanish
def cleaning_features(top_features):
clean_features = top_features.copy()
for feature in clean_features:
if feature+'s' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'s'))] = ''
if stopwords_lang == 'spanish':
for feature in clean_features:
if feature+'es' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'es'))] = ''
for feature in clean_features:
if feature+'r' in clean_features: clean_features[max(
clean_features.index(feature),clean_features.index(feature+'r'))] = ''
nosign_features = clean_features.copy()
if stopwords_lang == 'spanish':
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"á","a",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"é","e",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"í","i",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"ó","o",fet)
for pos,fet in enumerate(nosign_features):
nosign_features[pos]=re.sub(r"ú","u",fet)
for pos,fet in enumerate(nosign_features):
if fet in nosign_features[pos+1:]:
clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = ''
return clean_features
# Fit the NMF model
nmf = NMF(n_components=n_components, random_state=1,
alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# nmf.components_ is the H matrix
# W = nmf.fit_transform(tfidf)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Size of the vocabulary and the nmf matrix
#print(len(tfidf_vectorizer.vocabulary_))
#print(len(tfidf_feature_names))
#nmf.components_.shape
# In[ ]:
# ### Create file: Repr_Prop. Most representative proposal for each topic
# In[22]:
W = nmf.fit_transform(tfidf)
#print(W.shape)
repr_prop_df = pd.DataFrame(columns=repr_prop_cols)
for topic_index in range(n_components):
top_indices = np.argsort( W[:,topic_index] )[::-1]
top_represent_proposals = []
for proposal_index in top_indices[0:n_top_represent_props]:
top_represent_proposals.append(proposal_index)
for prop_internal_index in top_represent_proposals:
row = [topic_index,
proposals_input_df.loc[int(prop_internal_index),'id'],
proposals_input_df.loc[int(prop_internal_index),'title']]
repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True)
# In[23]:
repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False)
repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False)
# In[ ]:
# ### Create file: Topics_Tags. List of Topics with their top Tags
# In[24]:
topics_tags_df = pd.DataFrame(columns=['id']+tag_cols)
# nmf.components_ is the H matrix
for topic_idx, topic in enumerate(nmf.components_):
obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
clean_obj_temp = cleaning_features(obj_temp)
clean_obj_temp.insert(0, str(topic_idx))
#print(clean_obj_temp)
topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True)
# In[25]:
topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False)
topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False)
# In[ ]:
# ### Create file: Taggings. Each line is a Tag associated to a Proposal
# In[26]:
# Coefficients for the following calculation
tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols)
# nmf.components_ is the H matrix
for topic_idx, topic in enumerate(nmf.components_):
topics_tags_coefs_temp = []
topics_tags_coefs_temp.append(int(topic_idx))
for i in topic.argsort()[:-n_top_words - 1:-1]:
topics_tags_coefs_temp.append(topic[i])
topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols,
topics_tags_coefs_temp)), ignore_index=True)
for col in tag_cols:
for topic_idx,topic in enumerate(topics_tags_df[col].tolist()):
if topic == '':
topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0
# In[27]:
topics_tags_flat = []
for idx,topic in topics_tags_df.iterrows():
topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist()
topics_tags_coefs_flat = []
for idx,topic in topics_tags_coefs_df.iterrows():
topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist()
# In[28]:
taggings_file_df = pd.DataFrame(columns=taggings_file_cols)
for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)):
proposal_topics_temp = np.zeros((len(topics_tags_flat)))
cont = 0
for weight in prop:
for n in range(n_top_words):
proposal_topics_temp[cont] = weight
cont += 1
proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat
# Add up the coefficients of duplicate tags:
for numterm_a,term_a in enumerate(topics_tags_flat):
for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))):
term_b = topics_tags_flat[numterm_b]
if (term_a == term_b):
proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b]
proposal_tags_temp[numterm_b] = 0
for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]:
row = [i,proposals_input_df.loc[prop_idx,'id'],'Proposal']
taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
# ### Create file: Tags. List of Tags with the number of times they have been used
# In[29]:
tags_file_df = pd.DataFrame(columns=tags_file_cols)
for tag_id,tag in enumerate(topics_tags_flat):
row = [tag_id,tag,0,'']
tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)
for tag_id in taggings_file_df[taggings_file_cols_id].tolist():
tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1
# ### Deleting duplicate tags from the Tags and Taggings files before saving them
# In[30]:
change_rows = []
repeated_ids = []
for idx1,row1 in tags_file_df.iterrows():
for idx2,row2 in tags_file_df.iterrows():
if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']):
change_rows.append((idx1,idx2))
repeated_ids.append(idx2)
tags_file_df = tags_file_df.drop(repeated_ids)
# In[31]:
for c_row in change_rows:
taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x)
# In[32]:
tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False)
# In[33]:
taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False)
# In[ ]:
# In[47]:
# proposals_input_df
# In[48]:
# repr_prop_df
# In[49]:
# topics_tags_df
# In[50]:
# taggings_file_df
# In[51]:
# tags_file_df
# In[ ]:
# # LIST OF RELATED PROPOSALS
# In[34]:
proposal_topics_coefs_cols = ['id','topic_coefs']
proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols)
for prop_idx,prop in enumerate(W):
row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()]
proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)),
ignore_index=True)
# In[35]:
related_props_df = pd.DataFrame(columns=related_props_cols)
for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)):
prop_related_temp = []
prop_related_temp.append(int(row['id']))
vectora = row['topic_coefs']
distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()]
# the nearest neighbours include the initial proposal itself (distance 0), hence numb_related_proposals+1
for i in np.array(distances).argsort()[0:numb_related_proposals+1]:
if distances[i] != 0.0:
prop_related_temp.append(int(proposals_input_df.loc[i,'id']))
# in case there are fewer related proposals than the max number
while len(prop_related_temp) < numb_related_proposals+1:
prop_related_temp.append('')
related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True)
# In[36]:
related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False)
# In[ ]:
# In[45]:
#proposal_topics_coefs_df
# In[46]:
#related_props_df
# In[44]:
logging.info('Script executed correctly.')
# In[ ]:

View File

@@ -5,55 +5,545 @@
""" """
Proposals comments summaries - Dummy script Proposals comments summaries
This script generates for each proposal a summary of all its comments.
Running time: Max 1 hour for 10,000 proposals.
Technique used: GloVe embeddings and TextRank.
More info in: https://github.com/consul-ml/consul-ml
""" """
# In[2]: # In[2]:
data_path = '../data' # DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER:
config_file = 'proposals_summary_comments_textrank.ini'
logging_file ='proposals_summary_comments_textrank.log' # ENGLISH:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip
# SPANISH:
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
#!gunzip glove-sbwc*.gz
# In[ ]:
def check_file(file_name):
if os.path.isfile(file_name):
return
else:
try:
logging.info('Missing file in Proposals comments summaries: ' + str(file_name))
except NameError:
print('No logging')
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[ ]:
# Input file:
inputjsonfile = 'comments.json'
col_id = 'commentable_id'
col_content = 'body'
# Output files:
comments_summaries_filename = 'ml_comments_summaries_proposals.json'
comments_summaries_filename_csv = 'ml_comments_summaries_proposals.csv'
tqdm_notebook = True
# In[3]:
data_path = '../data'
config_file = 'proposals_summary_comments_textrank.ini'
logging_file ='proposals_summary_comments_textrank.log'
# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))
sent_token_lang = config['PREPROCESSING']['sent_token_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
if stopwords_lang == 'spanish':
glove_file = config['SUMMARISATION']['glove_file_es']
if stopwords_lang == 'english':
glove_file = config['SUMMARISATION']['glove_file_en']
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')
logging_level = config['LOGGING']['logging_level']
# In[5]:
import logging
logging.basicConfig(filename=os.path.join(data_path,logging_file),
filemode='w',
format='%(asctime)s - %(levelname)s - %(message)s',
level=logging_level)
#logging.info('message')
# In[6]:
import os
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import sys
# In[7]:
import nltk
if nltk_download:
nltk.download('stopwords')
nltk.download('punkt')
# In[8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# In[9]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
# In[10]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections
# In[11]:
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[12]:
# Different code for Spanish and English vectors
# Extract word vectors
check_file(os.path.join(data_path,glove_file))
if stopwords_lang == 'english':
non_keyed_embs = os.path.join(data_path,glove_file)
keyed_embs = os.path.join(data_path,glove_file+'.vec')
if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
glove2word2vec(non_keyed_embs, keyed_embs)
glove_file = glove_file+'.vec'
word_embeddings = {}
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
word_embeddings[word] = coefs
f.close()
# In[ ]:
# # Read the comments and join the comments belonging to the same proposal
# In[ ]:
check_file(os.path.join(data_path,inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Proposal']
# In[ ]:
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(comments_input_df) == 0:
logging.info('No Proposals comments found to summarise.')
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
file.write('[]')
os._exit(0)
# In[13]:
comments_input_df = comments_input_df[[col_id]+[col_content]]
# Normalise characters
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))
comments_input_df = comments_input_df.sort_values(by=col_id)
comments_input_df.reset_index(drop=True,inplace=True)
# In[14]:
# Drop empty texts
empty_txt_ids = []
for idx,row in comments_input_df.iterrows():
if row['body'].strip() == '':
empty_txt_ids.append(idx)
comments_input_df = comments_input_df.drop(empty_txt_ids)
comments_input_df.reset_index(drop=True,inplace=True)
# In[15]:
comments_df = pd.DataFrame()
temp_comments_joined = []
temp_comments_number = []
temp_proposal_id = []
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
temp_comments_joined.append('\n'.join(temp_list))
temp_comments_number.append(len(temp_list))
temp_proposal_id.append(prop_id)
comments_df['prop_id'] = temp_proposal_id
comments_df['comments_joined'] = temp_comments_joined
comments_df['comments_number'] = temp_comments_number
# In[16]:
# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))
# In[ ]:
# # Make comments lowercase
# In[17]:
comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())
# In[ ]:
# # Split sentences
# In[18]:
def split_sentences(txt):
new_text_1 = sent_tokenize(txt,sent_token_lang)
# outputs [] if txt is '' or consists only of ' ' or '\n'
new_text_2 = []
if new_text_1 != []:
for tok1 in new_text_1:
new_text_2 += tok1.split('\n')
#outputs [''] if txt is ''
new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']
if new_text_2 == []:
new_text_2 = ['']
return new_text_2
# In[19]:
comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)
# In[ ]:
# # Calculate sentence embeddings
# In[20]:
# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))
if stopwords_lang == 'spanish':
for word in stop_words:
stop_words = stop_words.union({re.sub(r"á","a",word)})
stop_words = stop_words.union({re.sub(r"é","e",word)})
stop_words = stop_words.union({re.sub(r"í","i",word)})
stop_words = stop_words.union({re.sub(r"ó","o",word)})
stop_words = stop_words.union({re.sub(r"ú","u",word)})
# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[21]:
def sentences_embeddings(sents):
sent_embs = []
for sent in sents:
words = set(word_tokenize(sent))
words = words-all_stop_words
if len(words) != 0:
emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
len(words)+0.001)
else:
emb = np.zeros(300)
sent_embs.append(emb)
return sent_embs
# In[22]:
if tqdm_notebook:
comments_df['comments_sentences_embeddings'] = comments_df[
'comments_sentences'].progress_apply(sentences_embeddings)
else:
comments_df['comments_sentences_embeddings'] = comments_df[
'comments_sentences'].apply(sentences_embeddings)
# In[ ]:
# # Calculate sentence scores
# In[23]:
def sentences_scores(sents, sent_embs):
# similarity matrix
if len(sent_embs) > 1:
stacked_sent_embs = np.stack(sent_embs)
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
np.fill_diagonal(sim_mat, 0)
elif len(sent_embs) == 1:
sim_mat = np.array([[0.]])
else:
return collections.OrderedDict([('',1.0)])
nx_graph = nx.from_numpy_array(sim_mat)
try:
sentence_weight_temp = nx.pagerank(nx_graph)
except:
sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)
sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}
sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
sentence_scores = collections.OrderedDict(sorted_sentence_weights)
return sentence_scores
# In[24]:
def plot_sentences_network(sents, sent_embs):
import matplotlib.pyplot as plt
# similarity matrix
if len(sent_embs) > 1:
stacked_sent_embs = np.stack(sent_embs)
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
np.fill_diagonal(sim_mat, 0)
elif len(sent_embs) == 1:
sim_mat = np.array([[0.]])
else:
print('Nothing to plot')
return
nx_graph = nx.from_numpy_array(sim_mat)
plt.plot()
nx.draw(nx_graph, with_labels=True)
# In[25]:
comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1)
# In[ ]:
# # Generate the summaries
# In[26]:
def comments_summary(sentence_weight, threshold_factor, *totalwords):
threshold = threshold_factor * np.mean(list(sentence_weight.values()))
sentence_counter = 0
comments_summary = ''
summary_num_words = 0
for sentence in sentence_weight:
if sentence_weight[sentence] >= (threshold):
if len(totalwords) == 0:
comments_summary += "\n- " + sentence
sentence_counter += 1
elif summary_num_words < totalwords[0]:
comments_summary += "\n- " + sentence
sentence_counter += 1
summary_num_words += len(sentence.split())
comments_summary = comments_summary.lstrip()
return comments_summary
# In[27]:
comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
lambda x: comments_summary(x,threshold_factor,max_size_of_summaries))
# In[28]:
# comments_df
# In[29]:
# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
# print(row['body'])
# print('-------')
# In[30]:
#print(comments_df.loc[8,'comments_summary'])
# In[ ]:
# In[31]:
comments_df['commentable_type'] = ['Proposal']*len(comments_df)
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
comments_summaries_df.reset_index(level=0, inplace=True)
# In[32]:
comments_summaries_df = comments_summaries_df.rename(
columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})
# In[33]:
#comments_summaries_df
# In[34]:
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)
# In[ ]:
# In[ ]:
# In[35]:
logging.info('Script executed correctly.')