From 6d6888f20135efc950dc8ea9d1b810fb087ee137 Mon Sep 17 00:00:00 2001 From: cronopioelectronico Date: Tue, 7 Sep 2021 18:18:47 +0200 Subject: [PATCH] Update machine learning scripts with NNMF and TextRank-GloVe techniques --- .../budgets_related_content_and_tags_nmf.py | 743 +++++++++++++++++- .../budgets_summary_comments_textrank.py | 538 ++++++++++++- .../proposals_related_content_and_tags_nmf.py | 743 +++++++++++++++++- .../proposals_summary_comments_textrank.py | 540 ++++++++++++- 4 files changed, 2445 insertions(+), 119 deletions(-) diff --git a/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py b/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py index e02639be1..da40f73b3 100644 --- a/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py +++ b/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py @@ -5,11 +5,63 @@ """ -Related Participatory Budgeting projects and Tags - Dummy script +Related Participatory Budgeting projects and Tags +This script generates for each project: a) Tags, b) List of related projects. +Running time: Max 2 hours for 10.000 projects. +Technique used: NNMF and Euclidean distance between projects. +More info in: https://github.com/consul-ml/consul-ml """ +# In[ ]: + + +def check_file(file_name): + if os.path.isfile(file_name): + return + else: + try: + logging.info('Missing file in Related Participatory Budgeting projects and Tags: ' + str(file_name)) + except NameError: + print('No logging') + with open(os.path.join(data_path,taggings_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,tags_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,related_props_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[ ]: + + +# Input file: +inputjsonfile = 'budget_investments.json' +col_id = 'id' +col_title = 'title' +cols_content = ['title','description'] + +# Output files: +topics_tags_filename = 'ml_topics_tags_budgets.json' +topics_tags_filename_csv = 'ml_topics_tags_budgets.csv' + +repr_prop_filename = 'ml_repr_budgets.json' +repr_prop_filename_csv = 'ml_repr_budgets.csv' + +taggings_filename = 'ml_taggings_budgets.json' +taggings_filename_csv = 'ml_taggings_budgets.csv' + +tags_filename = 'ml_tags_budgets.json' +tags_filename_csv = 'ml_tags_budgets.csv' + +related_props_filename = 'ml_related_content_budgets.json' +related_props_filename_csv = 'ml_related_content_budgets.csv' + +tqdm_notebook = True + + # In[2]: @@ -17,70 +69,691 @@ data_path = '../data' config_file = 'budgets_related_content_and_tags_nmf.ini' logging_file ='budgets_related_content_and_tags_nmf.log' +# Read the configuration file +import os +import configparser +config = configparser.ConfigParser() +check_file(os.path.join(data_path,config_file)) +config.read(os.path.join(data_path,config_file)) + +stanza_model_lang = config['PREPROCESSING']['stanza_model_lang'] +stopwords_lang = config['PREPROCESSING']['stopwords_lang'] +noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation') +n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count') +stanza_download = config['PREPROCESSING'].getboolean('stanza_download') +nltk_download = config['PREPROCESSING'].getboolean('nltk_download') + +numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals') + +numb_topics = config['TOPIC_MODELLING'].getint('numb_topics') +numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic') 
+n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props') +n_features = config['TOPIC_MODELLING'].getint('n_features') +min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val') +max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val') + +logging_level = config['LOGGING']['logging_level'] + # In[3]: -# Input file: -inputjsonfile = 'budget_investments.json' +related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)] -# Output files: -taggings_filename = 'ml_taggings_budgets.json' -tags_filename = 'ml_tags_budgets.json' -related_props_filename = 'ml_related_content_budgets.json' +repr_prop_cols = ['topic_id','proposal_id','title'] +tags_file_cols = ['id','name','taggings_count','kind'] +taggings_file_cols = ['tag_id','taggable_id','taggable_type'] +tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)] + +tags_file_cols_count = 'taggings_count' +taggings_file_cols_id = 'tag_id' # In[4]: -import os -import pandas as pd +import logging +logging.basicConfig(filename=os.path.join(data_path,logging_file), + filemode='w', + format='%(asctime)s - %(levelname)s - %(message)s', + level=logging_level) +#logging.info('message') -# ### Read the proposals # In[5]: -# proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") -# col_id = 'id' -# cols_content = ['title','description'] -# proposals_input_df = proposals_input_df[[col_id]+cols_content] +import os +import re +import numpy as np +import pandas as pd +from unicodedata import normalize +import sys -# ### Create file: Taggings. Each line is a Tag associated to a Proposal - # In[6]: -taggings_file_cols = ['tag_id','taggable_id','taggable_type'] -taggings_file_df = pd.DataFrame(columns=taggings_file_cols) -row = [0,1,'Budget::Investment'] -taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True) -taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False) +import stanza +if stanza_download: + stanza.download(stanza_model_lang) +# IF NEEDED define 'pos_batch_size': 10000 in the next cell, config options. +config = { + 'processors': 'tokenize,mwt,pos,lemma', + 'lang': stanza_model_lang + } +#not using depparse +nlp = stanza.Pipeline(**config) -# ### Create file: Tags. 
List of Tags with the number of times they have been used # In[7]: -tags_file_cols = ['id','name','taggings_count','kind'] -tags_file_df = pd.DataFrame(columns=tags_file_cols) -row = [0,'tag',0,''] -tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True) -tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False) +import tqdm +from tqdm.notebook import tqdm_notebook +tqdm_notebook.pandas() +# to use tqdm in pandas use progress_apply instead of apply -# ### Create file: List of related proposals - # In[8]: -numb_related_proposals = 2 -related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)] -related_props_df = pd.DataFrame(columns=related_props_cols) -row = [1]+['' for num in range(1,numb_related_proposals+1)] -related_props_df = related_props_df.append(dict(zip(related_props_cols,row)), ignore_index=True) -related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False) +import nltk +if nltk_download: + nltk.download('stopwords') + nltk.download('punkt') + +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize, sent_tokenize + + +# In[9]: + + +import gensim +from gensim.models.phrases import Phrases, Phraser + + +# In[10]: + + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.decomposition import NMF + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# # Read the proposals and join the content to use in the topic modelling + +# In[ ]: + + +check_file(os.path.join(data_path,inputjsonfile)) +proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") +proposals_input_df = proposals_input_df[[col_id]+cols_content] + + +# In[ ]: + + +# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS +if len(proposals_input_df) == 0: + logging.info('No Proposals found.') + with open(os.path.join(data_path,taggings_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,tags_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,related_props_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[11]: + + +# Normalise characters +for col in cols_content: + proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x)) + +proposals_input_df['joined_content'] = proposals_input_df[cols_content].agg('\n'.join, axis=1) +proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title})) + + +# In[ ]: + + + + + +# # Lemmatise the content + +# In[12]: + + +proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content'] + + +# In[13]: + + +# Using Stanza from Stanford NLP group +def content_processing_for_topicmodelling_1(txt): + + # Delete html tags and urls + tmp_txt = re.sub("<[^<]+?>","",txt) + tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt) + tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt) + tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt) + tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt) + + # Tokenise, lemmatise and select only the nouns + new_txt_tok = [] + if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0: + tmp_txt_nlp = nlp(tmp_txt) + + for sent in tmp_txt_nlp.sentences: + for token in sent.words: + if noun_lemmatisation: + if token.upos == 'NOUN': + new_txt_tok.append(token.lemma) + else: + new_txt_tok.append(token.text) + + return new_txt_tok + + +# In[14]: + + +if tqdm_notebook: + 
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1) +else: + proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1) + + +# In[ ]: + + + + + +# + +# # Clean the data + +# In[15]: + + +# Includes some extra steps for Spanish +# List of stop words to be removed +stop_words = set(stopwords.words(stopwords_lang)) + +if stopwords_lang == 'spanish': + for word in stop_words: + stop_words = stop_words.union({re.sub(r"á","a",word)}) + stop_words = stop_words.union({re.sub(r"é","e",word)}) + stop_words = stop_words.union({re.sub(r"í","i",word)}) + stop_words = stop_words.union({re.sub(r"ó","o",word)}) + stop_words = stop_words.union({re.sub(r"ú","u",word)}) + +# additional terms removed when found as an independent character +if stopwords_lang == 'spanish': + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'} +else: + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'} +all_stop_words = stop_words.union(additional_stop_words) + + +# In[16]: + + +def content_processing_for_topicmodelling_2(txt_tok): + new_text_tok = [] + for word in txt_tok: + new_word = word.lower() + new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word) + new_word = re.sub(r"[0-9]+","",new_word) + new_word = new_word.rstrip("\n") + if (len(new_word) != 0) and (new_word not in all_stop_words): + new_text_tok.append(new_word) + + return new_text_tok + + +# In[17]: + + +proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2) + + +# In[ ]: + + + + + +# # Detect n-grams + +# In[18]: + + +txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist() + +phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count) +txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram] +txt_bigram_joined = [' '.join(txt) for txt in txt_bigram] + +# may contain also cuadrigrams when joining 2 bigrams: +# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count) +# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram] +# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram] + +proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined +# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# # Topic modelling (NMF) + +# In[19]: + + +df_col_to_use = proposals_input_df['joined_content_topicmodelling'] + +# NUMBER OF TOPICS +n_components = numb_topics +# SELECT the TOP n_top_words WORDS for each topic +n_top_words = numb_topkeywords_pertopic + +# Use tf-idf features for NMF +tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val, + max_features=n_features) + +tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist()) + + +# In[20]: + + +# Includes some extra steps for Spanish +def cleaning_features(top_features): + clean_features = top_features.copy() + for feature in clean_features: + if feature+'s' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'s'))] = '' + + if stopwords_lang == 'spanish': + for feature in clean_features: + if feature+'es' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'es'))] = '' + for feature in 
clean_features: + if feature+'r' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'r'))] = '' + + nosign_features = clean_features.copy() + + if stopwords_lang == 'spanish': + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"á","a",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"é","e",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"í","i",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"ó","o",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"ú","u",fet) + + for pos,fet in enumerate(nosign_features): + if fet in nosign_features[pos+1:]: + clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = '' + + return clean_features + + +# Fit the NMF model +nmf = NMF(n_components=n_components, random_state=1, + alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf) + +# nmf.components_ is the H matrix +# W = nmf.fit_transform(tfidf) + +tfidf_feature_names = tfidf_vectorizer.get_feature_names() + +# Size of the vocabulary and the nmf matrix +#print(len(tfidf_vectorizer.vocabulary_)) +#print(len(tfidf_feature_names)) +#nmf.components_.shape + + +# In[ ]: + + + + + +# ### Create file: Repr_Prop. Most representative proposal for each topic + +# In[21]: + + +W = nmf.fit_transform(tfidf) +#print(W.shape) + +repr_prop_df = pd.DataFrame(columns=repr_prop_cols) + +for topic_index in range(n_components): + top_indices = np.argsort( W[:,topic_index] )[::-1] + top_represent_proposals = [] + for proposal_index in top_indices[0:n_top_represent_props]: + top_represent_proposals.append(proposal_index) + + for prop_internal_index in top_represent_proposals: + row = [topic_index, + proposals_input_df.loc[int(prop_internal_index),'id'], + proposals_input_df.loc[int(prop_internal_index),'title']] + repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True) + + +# In[22]: + + +repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False) +repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False) + + +# In[ ]: + + + + + +# ### Create file: Topics_Tags. List of Topics with their top Tags + +# In[23]: + + +topics_tags_df = pd.DataFrame(columns=['id']+tag_cols) + +# nmf.components_ is the H matrix + +for topic_idx, topic in enumerate(nmf.components_): + obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]] + clean_obj_temp = cleaning_features(obj_temp) + clean_obj_temp.insert(0, str(topic_idx)) + #print(clean_obj_temp) + topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True) + + +# In[24]: + + +topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False) +topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False) + + +# In[ ]: + + + + + +# ### Create file: Taggings. 
Each line is a Tag associated to a Proposal + +# In[25]: + + +# Coefficients for following calculation +tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)] +topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols) + +# nmf.components_ is the H matrix + +for topic_idx, topic in enumerate(nmf.components_): + topics_tags_coefs_temp = [] + topics_tags_coefs_temp.append(int(topic_idx)) + for i in topic.argsort()[:-n_top_words - 1:-1]: + topics_tags_coefs_temp.append(topic[i]) + topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols, + topics_tags_coefs_temp)), ignore_index=True) + +for col in tag_cols: + for topic_idx,topic in enumerate(topics_tags_df[col].tolist()): + if topic == '': + topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0 + + +# In[26]: + + +topics_tags_flat = [] +for idx,topic in topics_tags_df.iterrows(): + topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist() + +topics_tags_coefs_flat = [] +for idx,topic in topics_tags_coefs_df.iterrows(): + topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist() + + +# In[27]: + + +taggings_file_df = pd.DataFrame(columns=taggings_file_cols) + +for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)): + proposal_topics_temp = np.zeros((len(topics_tags_flat))) + cont = 0 + for weight in prop: + for n in range(n_top_words): + proposal_topics_temp[cont] = weight + cont += 1 + + proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat + + # Adding the coefficients of same tags: + for numterm_a,term_a in enumerate(topics_tags_flat): + for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))): + term_b = topics_tags_flat[numterm_b] + if (term_a == term_b): + proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b] + proposal_tags_temp[numterm_b] = 0 + + for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]: + row = [i,proposals_input_df.loc[prop_idx,'id'],'Budget::Investment'] + taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True) + + +# ### Create file: Tags. 
List of Tags with the number of times they have been used + +# In[28]: + + +tags_file_df = pd.DataFrame(columns=tags_file_cols) + +for tag_id,tag in enumerate(topics_tags_flat): + row = [tag_id,tag,0,''] + tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True) + +for tag_id in taggings_file_df[taggings_file_cols_id].tolist(): + tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1 + + +# ### Deleting duplicate tags from files Tag and Taggings before saving them + +# In[29]: + + +change_rows = [] +repeated_ids = [] +for idx1,row1 in tags_file_df.iterrows(): + for idx2,row2 in tags_file_df.iterrows(): + if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']): + change_rows.append((idx1,idx2)) + repeated_ids.append(idx2) + +tags_file_df = tags_file_df.drop(repeated_ids) + + +# In[30]: + + +for c_row in change_rows: + taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x) + + +# In[31]: + + +tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False) +tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False) + + +# In[32]: + + +taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False) +taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[33]: + + +# proposals_input_df + + +# In[34]: + + +# repr_prop_df + + +# In[35]: + + +# topics_tags_df + + +# In[36]: + + +# taggings_file_df + + +# In[37]: + + +# tags_file_df + + +# In[ ]: + + + + + +# # LIST OF RELATED PROPOSALS + +# In[38]: + + +proposal_topics_coefs_cols = ['id','topic_coefs'] +proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols) + +for prop_idx,prop in enumerate(W): + row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()] + proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)), + ignore_index=True) + + +# In[39]: + + +related_props_df = pd.DataFrame(columns=related_props_cols) + +for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)): + prop_related_temp = [] + prop_related_temp.append(int(row['id'])) + vectora = row['topic_coefs'] + distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()] + + # the vector contains also the id of the initial proposal, thus numb_related_proposals+1 + for i in np.array(distances).argsort()[0:numb_related_proposals+1]: + if distances[i] != 0.0: + prop_related_temp.append(int(proposals_input_df.loc[i,'id'])) + + # in case there are less related proposals than the max number + while len(prop_related_temp) < numb_related_proposals+1: + prop_related_temp.append('') + + related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True) + + +# In[40]: + + +related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False) +related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[41]: + + +#proposal_topics_coefs_df + + +# In[42]: + + +#related_props_df + + +# In[43]: + + +logging.info('Script executed correctly.') + + +# In[ ]: + + + diff --git a/public/machine_learning/scripts/budgets_summary_comments_textrank.py b/public/machine_learning/scripts/budgets_summary_comments_textrank.py index 
9f81ad66d..1c0faf07b 100644 --- a/public/machine_learning/scripts/budgets_summary_comments_textrank.py +++ b/public/machine_learning/scripts/budgets_summary_comments_textrank.py @@ -1,59 +1,549 @@ #!/usr/bin/env python # coding: utf-8 -# In[1]: +# In[ ]: """ -Participatory Budgeting comments summaries - Dummy script +Participatory Budgeting comments summaries +This script generates for each budget project a summary of all its comments. +Running time: Max 1 hour for 10.000 proposals. +Technique used: GloVe embeddings and TextRank. +More info in: https://github.com/consul-ml/consul-ml """ -# In[2]: +# In[ ]: + + +# DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER: + +# ENGLISH: +#!wget https://nlp.stanford.edu/data/glove.6B.zip +#!gunzip glove.6B.zip + +# SPANISH: +#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz +#!gunzip glove-sbwc*.gz + + +# In[ ]: + + +def check_file(file_name): + if os.path.isfile(file_name): + return + else: + try: + logging.info('Missing file in Participatory Budgeting comments summaries: ' + str(file_name)) + except NameError: + print('No logging') + with open(os.path.join(data_path,comments_summaries_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[ ]: + + +# Input file: +inputjsonfile = 'comments.json' +col_id = 'commentable_id' +col_content = 'body' + +# Output files: +comments_summaries_filename = 'ml_comments_summaries_budgets.json' +comments_summaries_filename_csv = 'ml_comments_summaries_budgets.csv' + +tqdm_notebook = True + + +# In[ ]: data_path = '../data' config_file = 'budgets_summary_comments_textrank.ini' logging_file ='budgets_summary_comments_textrank.log' +# Read the configuration file +import os +import configparser +config = configparser.ConfigParser() +check_file(os.path.join(data_path,config_file)) +config.read(os.path.join(data_path,config_file)) -# In[3]: +sent_token_lang = config['PREPROCESSING']['sent_token_lang'] +stopwords_lang = config['PREPROCESSING']['stopwords_lang'] +nltk_download = config['PREPROCESSING'].getboolean('nltk_download') + +if stopwords_lang == 'spanish': + glove_file = config['SUMMARISATION']['glove_file_es'] +if stopwords_lang == 'english': + glove_file = config['SUMMARISATION']['glove_file_en'] +threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor') +max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries') + +logging_level = config['LOGGING']['logging_level'] -# Input file: -inputjsonfile = 'comments.json' - -# Output files: -comments_summaries_filename = 'ml_comments_summaries_budgets.json' +# In[ ]: -# In[4]: +import logging + +logging.basicConfig(filename=os.path.join(data_path,logging_file), + filemode='w', + format='%(asctime)s - %(levelname)s - %(message)s', + level=logging_level) +#logging.info('message') + + +# In[ ]: import os import pandas as pd +import numpy as np +import re +from unicodedata import normalize +import sys -# ### Read the comments - -# In[5]: +# In[ ]: -# comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") -# col_id = 'commentable_id' -# col_content = 'body' -# comments_input_df = comments_input_df[[col_id]+[col_content]] +import nltk +if nltk_download: + nltk.download('stopwords') + nltk.download('punkt') -# ### Create file. 
Comments summaries - -# In[6]: +# In[ ]: + + +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize, sent_tokenize + + +# In[ ]: + + +from gensim.scripts.glove2word2vec import glove2word2vec +from gensim.models import KeyedVectors + + +# In[ ]: + + +from sklearn.metrics.pairwise import cosine_similarity +import networkx as nx +import collections + + +# In[ ]: + + +import tqdm +from tqdm.notebook import tqdm_notebook +tqdm_notebook.pandas() +# to use tqdm in pandas use progress_apply instead of apply + + +# In[ ]: + + +# Different code for Spanish and English vectors +# Extract word vectors + +check_file(os.path.join(data_path,glove_file)) + +if stopwords_lang == 'english': + non_keyed_embs = os.path.join(data_path,glove_file) + keyed_embs = os.path.join(data_path,glove_file+'.vec') + if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)): + glove2word2vec(non_keyed_embs, keyed_embs) + glove_file = glove_file+'.vec' + +word_embeddings = {} +f = open(os.path.join(data_path,glove_file), encoding='utf-8') +for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + word_embeddings[word] = coefs +f.close() + + +# In[ ]: + + + + + +# # Read the comments and join the comments belonging to the same proposal + +# In[ ]: + + +check_file(os.path.join(data_path,inputjsonfile)) +comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") +comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Budget::Investment'] + + +# In[ ]: + + +# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS +if len(comments_input_df) == 0: + logging.info('No Participatory Budgeting comments found to summarise.') + with open(os.path.join(data_path,comments_summaries_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[ ]: + + +comments_input_df = comments_input_df[[col_id]+[col_content]] + +# Normalise characters +comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x)) + +comments_input_df = comments_input_df.sort_values(by=col_id) +comments_input_df.reset_index(drop=True,inplace=True) + + +# In[ ]: + + +# Drop empty texts + +empty_txt_ids = [] +for idx,row in comments_input_df.iterrows(): + if row['body'].strip() == '': + empty_txt_ids.append(idx) + +comments_input_df = comments_input_df.drop(empty_txt_ids) +comments_input_df.reset_index(drop=True,inplace=True) + + +# In[ ]: + + +comments_df = pd.DataFrame() + +temp_comments_joined = [] +temp_comments_number = [] +temp_proposal_id = [] +for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))): + temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist() + temp_comments_joined.append('\n'.join(temp_list)) + temp_comments_number.append(len(temp_list)) + temp_proposal_id.append(prop_id) + +comments_df['prop_id'] = temp_proposal_id +comments_df['comments_joined'] = temp_comments_joined +comments_df['comments_number'] = temp_comments_number + + +# In[ ]: + + +# # Stats +# print(len(comments_df)) +# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)])) +# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)])) +# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)])) + + +# In[ ]: + + + + + +# # Make comments lowercase + +# In[ ]: + + 
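+# Lowercasing is useful here because the GloVe vocabularies loaded above generally
+# store lowercase tokens, so mixed-case words would otherwise miss in the
+# word_embeddings lookup performed later by sentences_embeddings().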
+comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower()) + + +# In[ ]: + + + + + +# # Split sentences + +# In[ ]: + + +def split_sentences(txt): + new_text_1 = sent_tokenize(txt,sent_token_lang) + #outputs [] if txt is ''; or made of ' ' or '\n' + + new_text_2 = [] + if new_text_1 != []: + for tok1 in new_text_1: + new_text_2 += tok1.split('\n') + #outputs [''] if txt is '' + new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != ''] + + if new_text_2 == []: + new_text_2 = [''] + + return new_text_2 + + +# In[ ]: + + +comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences) + + +# In[ ]: + + + + + +# # Calculate sentence embeddings + +# In[ ]: + + +# Includes some extra steps for Spanish +# List of stop words to be removed +stop_words = set(stopwords.words(stopwords_lang)) + +if stopwords_lang == 'spanish': + for word in stop_words: + stop_words = stop_words.union({re.sub(r"á","a",word)}) + stop_words = stop_words.union({re.sub(r"é","e",word)}) + stop_words = stop_words.union({re.sub(r"í","i",word)}) + stop_words = stop_words.union({re.sub(r"ó","o",word)}) + stop_words = stop_words.union({re.sub(r"ú","u",word)}) + +# additional terms removed when found as an independent character +if stopwords_lang == 'spanish': + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'} +else: + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'} +all_stop_words = stop_words.union(additional_stop_words) + + +# In[ ]: + + +def sentences_embeddings(sents): + sent_embs = [] + + for sent in sents: + words = set(word_tokenize(sent)) + words = words-all_stop_words + if len(words) != 0: + emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/( + len(words)+0.001) + else: + emb = np.zeros(300) + sent_embs.append(emb) + + return sent_embs + + +# In[ ]: + + +if tqdm_notebook: + comments_df['comments_sentences_embeddings'] = comments_df[ + 'comments_sentences'].progress_apply(sentences_embeddings) +else: + comments_df['comments_sentences_embeddings'] = comments_df[ + 'comments_sentences'].apply(sentences_embeddings) + + +# In[ ]: + + + + + +# # Calculate sentence scores + +# In[ ]: + + +def sentences_scores(sents, sent_embs): + + # similarity matrix + if len(sent_embs) > 1: + stacked_sent_embs = np.stack(sent_embs) + sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs) + np.fill_diagonal(sim_mat, 0) + elif len(sent_embs) == 1: + sim_mat = np.array([[0.]]) + else: + return collections.OrderedDict([('',1.0)]) + + nx_graph = nx.from_numpy_array(sim_mat) + + try: + sentence_weight_temp = nx.pagerank(nx_graph) + except: + sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0) + + sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()} + + sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True) + sentence_scores = collections.OrderedDict(sorted_sentence_weights) + + return sentence_scores + + +# In[ ]: + + +def plot_sentences_network(sents, sent_embs): + import matplotlib.pyplot as plt + + # similarity matrix + if len(sent_embs) > 1: + stacked_sent_embs = np.stack(sent_embs) + sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs) + np.fill_diagonal(sim_mat, 0) + elif len(sent_embs) == 1: + sim_mat = np.array([[0.]]) + else: + print('Nothing to plot') + return + + nx_graph = nx.from_numpy_array(sim_mat) + + plt.plot() + nx.draw(nx_graph, with_labels=True) + + +# In[ ]: + 
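+# Minimal usage sketch of the TextRank scoring defined above (toy sentences and a
+# hypothetical outcome, kept commented out): sentences_scores() builds a
+# cosine-similarity graph over the averaged GloVe sentence embeddings and ranks the
+# sentences with PageRank, returning an OrderedDict sorted by descending weight.
+#
+# toy_sents = ['el parque necesita arboles', 'faltan arboles en el parque', 'hola']
+# toy_scores = sentences_scores(toy_sents, sentences_embeddings(toy_sents))
+# # The two sentences that share vocabulary reinforce each other in the graph and
+# # should receive higher TextRank weights than the unrelated third one.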
+ +comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply( + lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1) + + +# In[ ]: + + + + + +# # Generate the summaries + +# In[ ]: + + +def comments_summary(sentence_weight, threshold_factor, *totalwords): + + threshold = threshold_factor * np.mean(list(sentence_weight.values())) + + sentence_counter = 0 + comments_summary = '' + + summary_num_words = 0 + + for sentence in sentence_weight: + if sentence_weight[sentence] >= (threshold): + if len(totalwords) == 0: + comments_summary += "\n- " + sentence + sentence_counter += 1 + elif summary_num_words < totalwords[0]: + comments_summary += "\n- " + sentence + sentence_counter += 1 + summary_num_words += len(sentence.split()) + + comments_summary = comments_summary.lstrip() + return comments_summary + + +# In[ ]: + + +comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply( + lambda x: comments_summary(x,threshold_factor,max_size_of_summaries)) + + +# In[ ]: + + +# comments_df + + +# In[ ]: + + +# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows(): +# print(row['body']) +# print('-------') + + +# In[ ]: + + +#print(comments_df.loc[8,'comments_summary']) + + +# In[ ]: + + + + + +# In[ ]: + + +comments_df['commentable_type'] = ['Budget::Investment']*len(comments_df) +comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']] +comments_summaries_df.reset_index(level=0, inplace=True) + + +# In[ ]: + + +comments_summaries_df = comments_summaries_df.rename( + columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"}) + + +# In[ ]: + + +#comments_summaries_df + + +# In[ ]: -comments_summaries_cols = ['id','commentable_id','commentable_type','body'] -comments_summaries_df = pd.DataFrame(columns=comments_summaries_cols) -row = [0,0,'Budget::Investment','Summary'] -comments_summaries_df = comments_summaries_df.append(dict(zip(comments_summaries_cols,row)), ignore_index=True) comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False) +comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[ ]: + + +logging.info('Script executed correctly.') diff --git a/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py b/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py index 75d2ef02b..4c303ad28 100644 --- a/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py +++ b/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py @@ -5,11 +5,63 @@ """ -Related Proposals and Tags - Dummy script +Related Proposals and Tags +This script generates for each proposal: a) Tags, b) List of related proposals. +Running time: Max 2 hours for 10.000 proposals. +Technique used: NNMF and Euclidean distance between proposals. 
+More info in: https://github.com/consul-ml/consul-ml """ +# In[ ]: + + +def check_file(file_name): + if os.path.isfile(file_name): + return + else: + try: + logging.info('Missing file in Related Proposals and Tags: ' + str(file_name)) + except NameError: + print('No logging') + with open(os.path.join(data_path,taggings_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,tags_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,related_props_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[ ]: + + +# Input file: +inputjsonfile = 'proposals.json' +col_id = 'id' +col_title = 'title' +cols_content = ['title','description','summary'] + +# Output files: +topics_tags_filename = 'ml_topics_tags_proposals.json' +topics_tags_filename_csv = 'ml_topics_tags_proposals.csv' + +repr_prop_filename = 'ml_repr_proposals.json' +repr_prop_filename_csv = 'ml_repr_proposals.csv' + +taggings_filename = 'ml_taggings_proposals.json' +taggings_filename_csv = 'ml_taggings_proposals.csv' + +tags_filename = 'ml_tags_proposals.json' +tags_filename_csv = 'ml_tags_proposals.csv' + +related_props_filename = 'ml_related_content_proposals.json' +related_props_filename_csv = 'ml_related_content_proposals.csv' + +tqdm_notebook = True + + # In[2]: @@ -17,70 +69,691 @@ data_path = '../data' config_file = 'proposals_related_content_and_tags_nmf.ini' logging_file ='proposals_related_content_and_tags_nmf.log' +# Read the configuration file +import os +import configparser +config = configparser.ConfigParser() +check_file(os.path.join(data_path,config_file)) +config.read(os.path.join(data_path,config_file)) + +stanza_model_lang = config['PREPROCESSING']['stanza_model_lang'] +stopwords_lang = config['PREPROCESSING']['stopwords_lang'] +noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation') +n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count') +stanza_download = config['PREPROCESSING'].getboolean('stanza_download') +nltk_download = config['PREPROCESSING'].getboolean('nltk_download') + +numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals') + +numb_topics = config['TOPIC_MODELLING'].getint('numb_topics') +numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic') +n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props') +n_features = config['TOPIC_MODELLING'].getint('n_features') +min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val') +max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val') + +logging_level = config['LOGGING']['logging_level'] + # In[3]: -# Input file: -inputjsonfile = 'proposals.json' +related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)] -# Output files: -taggings_filename = 'ml_taggings_proposals.json' -tags_filename = 'ml_tags_proposals.json' -related_props_filename = 'ml_related_content_proposals.json' +repr_prop_cols = ['topic_id','proposal_id','title'] +tags_file_cols = ['id','name','taggings_count','kind'] +taggings_file_cols = ['tag_id','taggable_id','taggable_type'] +tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)] + +tags_file_cols_count = 'taggings_count' +taggings_file_cols_id = 'tag_id' # In[4]: -import os -import pandas as pd +import logging +logging.basicConfig(filename=os.path.join(data_path,logging_file), + filemode='w', + format='%(asctime)s - %(levelname)s - %(message)s', + level=logging_level) +#logging.info('message') -# 
### Read the proposals # In[5]: -# proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") -# col_id = 'id' -# cols_content = ['title','description','summary'] -# proposals_input_df = proposals_input_df[[col_id]+cols_content] +import os +import re +import numpy as np +import pandas as pd +from unicodedata import normalize +import sys -# ### Create file: Taggings. Each line is a Tag associated to a Proposal - # In[6]: -taggings_file_cols = ['tag_id','taggable_id','taggable_type'] -taggings_file_df = pd.DataFrame(columns=taggings_file_cols) -row = [0,1,'Proposal'] -taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True) -taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False) +import stanza +if stanza_download: + stanza.download(stanza_model_lang) +# IF NEEDED define 'pos_batch_size': 10000 in the next cell, config options. +config = { + 'processors': 'tokenize,mwt,pos,lemma', + 'lang': stanza_model_lang + } +#not using depparse +nlp = stanza.Pipeline(**config) -# ### Create file: Tags. List of Tags with the number of times they have been used # In[7]: -tags_file_cols = ['id','name','taggings_count','kind'] -tags_file_df = pd.DataFrame(columns=tags_file_cols) -row = [0,'tag',0,''] -tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True) -tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False) +import tqdm +from tqdm.notebook import tqdm_notebook +tqdm_notebook.pandas() +# to use tqdm in pandas use progress_apply instead of apply -# ### Create file: List of related proposals - # In[8]: -numb_related_proposals = 2 -related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)] -related_props_df = pd.DataFrame(columns=related_props_cols) -row = [1]+['' for num in range(1,numb_related_proposals+1)] -related_props_df = related_props_df.append(dict(zip(related_props_cols,row)), ignore_index=True) -related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False) +import nltk +if nltk_download: + nltk.download('stopwords') + nltk.download('punkt') + +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize, sent_tokenize + + +# In[9]: + + +import gensim +from gensim.models.phrases import Phrases, Phraser + + +# In[10]: + + +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.decomposition import NMF + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# # Read the proposals and join the content to use in the topic modelling + +# In[ ]: + + +check_file(os.path.join(data_path,inputjsonfile)) +proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") +proposals_input_df = proposals_input_df[[col_id]+cols_content] + + +# In[ ]: + + +# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS +if len(proposals_input_df) == 0: + logging.info('No Proposals found.') + with open(os.path.join(data_path,taggings_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,tags_filename), 'w') as file: + file.write('[]') + with open(os.path.join(data_path,related_props_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[11]: + + +# Normalise characters +for col in cols_content: + proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x)) + +proposals_input_df['joined_content'] = 
proposals_input_df[cols_content].agg('\n'.join, axis=1) +proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title})) + + +# In[ ]: + + + + + +# # Lemmatise the content + +# In[12]: + + +proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content'] + + +# In[13]: + + +# Using Stanza from Stanford NLP group +def content_processing_for_topicmodelling_1(txt): + + # Delete html tags and urls + tmp_txt = re.sub("<[^<]+?>","",txt) + tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt) + tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt) + tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt) + tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt) + + # Tokenise, lemmatise and select only the nouns + new_txt_tok = [] + if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0: + tmp_txt_nlp = nlp(tmp_txt) + + for sent in tmp_txt_nlp.sentences: + for token in sent.words: + if noun_lemmatisation: + if token.upos == 'NOUN': + new_txt_tok.append(token.lemma) + else: + new_txt_tok.append(token.text) + + return new_txt_tok + + +# In[14]: + + +if tqdm_notebook: + proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1) +else: + proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1) + + +# In[ ]: + + + + + +# + +# # Clean the data + +# In[16]: + + +# Includes some extra steps for Spanish +# List of stop words to be removed +stop_words = set(stopwords.words(stopwords_lang)) + +if stopwords_lang == 'spanish': + for word in stop_words: + stop_words = stop_words.union({re.sub(r"á","a",word)}) + stop_words = stop_words.union({re.sub(r"é","e",word)}) + stop_words = stop_words.union({re.sub(r"í","i",word)}) + stop_words = stop_words.union({re.sub(r"ó","o",word)}) + stop_words = stop_words.union({re.sub(r"ú","u",word)}) + +# additional terms removed when found as an independent character +if stopwords_lang == 'spanish': + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'} +else: + additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'} +all_stop_words = stop_words.union(additional_stop_words) + + +# In[17]: + + +def content_processing_for_topicmodelling_2(txt_tok): + new_text_tok = [] + for word in txt_tok: + new_word = word.lower() + new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word) + new_word = re.sub(r"[0-9]+","",new_word) + new_word = new_word.rstrip("\n") + if (len(new_word) != 0) and (new_word not in all_stop_words): + new_text_tok.append(new_word) + + return new_text_tok + + +# In[18]: + + +proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[ + 'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2) + + +# In[ ]: + + + + + +# # Detect n-grams + +# In[19]: + + +txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist() + +phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count) +txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram] +txt_bigram_joined = [' '.join(txt) for txt in txt_bigram] + +# may contain also cuadrigrams when joining 2 bigrams: +# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count) +# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram] +# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram] + 
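+# Illustrative sketch of the bigram detection above (toy corpus, hypothetical output,
+# kept commented out): Phrases merges word pairs that co-occur at least
+# n_gram_min_count times and score above its internal threshold into single
+# underscore-joined tokens, which is the transform applied to txt_unigram.
+#
+# toy_corpus = [['carril', 'bici', 'parque'], ['carril', 'bici']] * 20
+# toy_bigrams = Phrases(toy_corpus, min_count=n_gram_min_count)
+# # toy_bigrams[['carril', 'bici']] may then yield ['carril_bici'].
+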
+proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined +# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# # Topic modelling (NMF) + +# In[20]: + + +df_col_to_use = proposals_input_df['joined_content_topicmodelling'] + +# NUMBER OF TOPICS +n_components = numb_topics +# SELECT the TOP n_top_words WORDS for each topic +n_top_words = numb_topkeywords_pertopic + +# Use tf-idf features for NMF +tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val, + max_features=n_features) + +tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist()) + + +# In[21]: + + +# Includes some extra steps for Spanish +def cleaning_features(top_features): + clean_features = top_features.copy() + for feature in clean_features: + if feature+'s' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'s'))] = '' + + if stopwords_lang == 'spanish': + for feature in clean_features: + if feature+'es' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'es'))] = '' + for feature in clean_features: + if feature+'r' in clean_features: clean_features[max( + clean_features.index(feature),clean_features.index(feature+'r'))] = '' + + nosign_features = clean_features.copy() + + if stopwords_lang == 'spanish': + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"á","a",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"é","e",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"í","i",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"ó","o",fet) + for pos,fet in enumerate(nosign_features): + nosign_features[pos]=re.sub(r"ú","u",fet) + + for pos,fet in enumerate(nosign_features): + if fet in nosign_features[pos+1:]: + clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = '' + + return clean_features + + +# Fit the NMF model +nmf = NMF(n_components=n_components, random_state=1, + alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf) + +# nmf.components_ is the H matrix +# W = nmf.fit_transform(tfidf) + +tfidf_feature_names = tfidf_vectorizer.get_feature_names() + +# Size of the vocabulary and the nmf matrix +#print(len(tfidf_vectorizer.vocabulary_)) +#print(len(tfidf_feature_names)) +#nmf.components_.shape + + +# In[ ]: + + + + + +# ### Create file: Repr_Prop. Most representative proposal for each topic + +# In[22]: + + +W = nmf.fit_transform(tfidf) +#print(W.shape) + +repr_prop_df = pd.DataFrame(columns=repr_prop_cols) + +for topic_index in range(n_components): + top_indices = np.argsort( W[:,topic_index] )[::-1] + top_represent_proposals = [] + for proposal_index in top_indices[0:n_top_represent_props]: + top_represent_proposals.append(proposal_index) + + for prop_internal_index in top_represent_proposals: + row = [topic_index, + proposals_input_df.loc[int(prop_internal_index),'id'], + proposals_input_df.loc[int(prop_internal_index),'title']] + repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True) + + +# In[23]: + + +repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False) +repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False) + + +# In[ ]: + + + + + +# ### Create file: Topics_Tags. 
List of Topics with their top Tags + +# In[24]: + + +topics_tags_df = pd.DataFrame(columns=['id']+tag_cols) + +# nmf.components_ is the H matrix + +for topic_idx, topic in enumerate(nmf.components_): + obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]] + clean_obj_temp = cleaning_features(obj_temp) + clean_obj_temp.insert(0, str(topic_idx)) + #print(clean_obj_temp) + topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True) + + +# In[25]: + + +topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False) +topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False) + + +# In[ ]: + + + + + +# ### Create file: Taggings. Each line is a Tag associated to a Proposal + +# In[26]: + + +# Coefficients for following calculation +tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)] +topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols) + +# nmf.components_ is the H matrix + +for topic_idx, topic in enumerate(nmf.components_): + topics_tags_coefs_temp = [] + topics_tags_coefs_temp.append(int(topic_idx)) + for i in topic.argsort()[:-n_top_words - 1:-1]: + topics_tags_coefs_temp.append(topic[i]) + topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols, + topics_tags_coefs_temp)), ignore_index=True) + +for col in tag_cols: + for topic_idx,topic in enumerate(topics_tags_df[col].tolist()): + if topic == '': + topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0 + + +# In[27]: + + +topics_tags_flat = [] +for idx,topic in topics_tags_df.iterrows(): + topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist() + +topics_tags_coefs_flat = [] +for idx,topic in topics_tags_coefs_df.iterrows(): + topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist() + + +# In[28]: + + +taggings_file_df = pd.DataFrame(columns=taggings_file_cols) + +for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)): + proposal_topics_temp = np.zeros((len(topics_tags_flat))) + cont = 0 + for weight in prop: + for n in range(n_top_words): + proposal_topics_temp[cont] = weight + cont += 1 + + proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat + + # Adding the coefficients of same tags: + for numterm_a,term_a in enumerate(topics_tags_flat): + for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))): + term_b = topics_tags_flat[numterm_b] + if (term_a == term_b): + proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b] + proposal_tags_temp[numterm_b] = 0 + + for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]: + row = [i,proposals_input_df.loc[prop_idx,'id'],'Proposal'] + taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True) + + +# ### Create file: Tags. 
List of Tags with the number of times they have been used + +# In[29]: + + +tags_file_df = pd.DataFrame(columns=tags_file_cols) + +for tag_id,tag in enumerate(topics_tags_flat): + row = [tag_id,tag,0,''] + tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True) + +for tag_id in taggings_file_df[taggings_file_cols_id].tolist(): + tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1 + + +# ### Deleting duplicate tags from files Tag and Taggings before saving them + +# In[30]: + + +change_rows = [] +repeated_ids = [] +for idx1,row1 in tags_file_df.iterrows(): + for idx2,row2 in tags_file_df.iterrows(): + if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']): + change_rows.append((idx1,idx2)) + repeated_ids.append(idx2) + +tags_file_df = tags_file_df.drop(repeated_ids) + + +# In[31]: + + +for c_row in change_rows: + taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x) + + +# In[32]: + + +tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False) +tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False) + + +# In[33]: + + +taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False) +taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[47]: + + +# proposals_input_df + + +# In[48]: + + +# repr_prop_df + + +# In[49]: + + +# topics_tags_df + + +# In[50]: + + +# taggings_file_df + + +# In[51]: + + +# tags_file_df + + +# In[ ]: + + + + + +# # LIST OF RELATED PROPOSALS + +# In[34]: + + +proposal_topics_coefs_cols = ['id','topic_coefs'] +proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols) + +for prop_idx,prop in enumerate(W): + row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()] + proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)), + ignore_index=True) + + +# In[35]: + + +related_props_df = pd.DataFrame(columns=related_props_cols) + +for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)): + prop_related_temp = [] + prop_related_temp.append(int(row['id'])) + vectora = row['topic_coefs'] + distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()] + + # the vector contains also the id of the initial proposal, thus numb_related_proposals+1 + for i in np.array(distances).argsort()[0:numb_related_proposals+1]: + if distances[i] != 0.0: + prop_related_temp.append(int(proposals_input_df.loc[i,'id'])) + + # in case there are less related proposals than the max number + while len(prop_related_temp) < numb_related_proposals+1: + prop_related_temp.append('') + + related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True) + + +# In[36]: + + +related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False) +related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[45]: + + +#proposal_topics_coefs_df + + +# In[46]: + + +#related_props_df + + +# In[44]: + + +logging.info('Script executed correctly.') + + +# In[ ]: + + + diff --git a/public/machine_learning/scripts/proposals_summary_comments_textrank.py b/public/machine_learning/scripts/proposals_summary_comments_textrank.py index 
2af827979..440083558 100644 --- a/public/machine_learning/scripts/proposals_summary_comments_textrank.py +++ b/public/machine_learning/scripts/proposals_summary_comments_textrank.py @@ -5,55 +5,545 @@ """ -Proposals comments summaries - Dummy script +Proposals comments summaries +This script generates for each proposal a summary of all its comments. +Running time: Max 1 hour for 10.000 proposals. +Technique used: GloVe embeddings and TextRank. +More info in: https://github.com/consul-ml/consul-ml """ # In[2]: -data_path = '../data' -config_file = 'proposals_summary_comments_textrank.ini' -logging_file ='proposals_summary_comments_textrank.log' +# DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER: + +# ENGLISH: +#!wget https://nlp.stanford.edu/data/glove.6B.zip +#!gunzip glove.6B.zip + +# SPANISH: +#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz +#!gunzip glove-sbwc*.gz + + +# In[ ]: + + +def check_file(file_name): + if os.path.isfile(file_name): + return + else: + try: + logging.info('Missing file in Proposals comments summaries: ' + str(file_name)) + except NameError: + print('No logging') + with open(os.path.join(data_path,comments_summaries_filename), 'w') as file: + file.write('[]') + os._exit(0) + + +# In[ ]: + + +# Input file: +inputjsonfile = 'comments.json' +col_id = 'commentable_id' +col_content = 'body' + +# Output files: +comments_summaries_filename = 'ml_comments_summaries_proposals.json' +comments_summaries_filename_csv = 'ml_comments_summaries_proposals.csv' + +tqdm_notebook = True # In[3]: -# Input file: -inputjsonfile = 'comments.json' - -# Output files: -comments_summaries_filename = 'ml_comments_summaries_proposals.json' - - -# In[4]: - +data_path = '../data' +config_file = 'proposals_summary_comments_textrank.ini' +logging_file ='proposals_summary_comments_textrank.log' +# Read the configuration file import os -import pandas as pd +import configparser +config = configparser.ConfigParser() +check_file(os.path.join(data_path,config_file)) +config.read(os.path.join(data_path,config_file)) +sent_token_lang = config['PREPROCESSING']['sent_token_lang'] +stopwords_lang = config['PREPROCESSING']['stopwords_lang'] +nltk_download = config['PREPROCESSING'].getboolean('nltk_download') + +if stopwords_lang == 'spanish': + glove_file = config['SUMMARISATION']['glove_file_es'] +if stopwords_lang == 'english': + glove_file = config['SUMMARISATION']['glove_file_en'] +threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor') +max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries') + +logging_level = config['LOGGING']['logging_level'] -# ### Read the comments # In[5]: -# comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records") -# col_id = 'commentable_id' -# col_content = 'body' -# comments_input_df = comments_input_df[[col_id]+[col_content]] +import logging +logging.basicConfig(filename=os.path.join(data_path,logging_file), + filemode='w', + format='%(asctime)s - %(levelname)s - %(message)s', + level=logging_level) +#logging.info('message') -# ### Create file. 
# In[6]:


-comments_summaries_cols = ['id','commentable_id','commentable_type','body']
-comments_summaries_df = pd.DataFrame(columns=comments_summaries_cols)
-row = [0,0,'Proposal','Summary']
-comments_summaries_df = comments_summaries_df.append(dict(zip(comments_summaries_cols,row)), ignore_index=True)
-comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
+import os
+import pandas as pd
+import numpy as np
+import re
+from unicodedata import normalize
+import sys


# In[7]:


import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')


# In[8]:


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


# In[9]:


from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


# In[10]:


from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections


# In[11]:


import tqdm
# the alias avoids clobbering the tqdm_notebook flag defined with the input files above
from tqdm.notebook import tqdm_notebook as tqdm_nb
tqdm_nb.pandas()
# to use tqdm in pandas use progress_apply instead of apply


# In[12]:


# Different code for Spanish and English vectors
# Extract word vectors

check_file(os.path.join(data_path,glove_file))

if stopwords_lang == 'english':
    non_keyed_embs = os.path.join(data_path,glove_file)
    keyed_embs = os.path.join(data_path,glove_file+'.vec')
    if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
        glove2word2vec(non_keyed_embs, keyed_embs)
    glove_file = glove_file+'.vec'

word_embeddings = {}
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
for line in f:
    values = line.split()
    if len(values) == 2:
        # skip the "<vocab_size> <dimensions>" header of word2vec-format (.vec) files
        continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()


# In[ ]:




# # Read the comments and join the comments belonging to the same proposal

# In[ ]:


check_file(os.path.join(data_path,inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Proposal']


# In[ ]:


# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSAL COMMENTS
if len(comments_input_df) == 0:
    logging.info('No Proposals comments found to summarise.')
    with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
        file.write('[]')
    os._exit(0)


# In[13]:


comments_input_df = comments_input_df[[col_id]+[col_content]]

# Normalise characters
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))

comments_input_df = comments_input_df.sort_values(by=col_id)
comments_input_df.reset_index(drop=True,inplace=True)


# In[14]:


# Drop empty texts

empty_txt_ids = []
for idx,row in comments_input_df.iterrows():
    if row['body'].strip() == '':
        empty_txt_ids.append(idx)

comments_input_df = comments_input_df.drop(empty_txt_ids)
comments_input_df.reset_index(drop=True,inplace=True)


# In[15]:


comments_df = pd.DataFrame()

temp_comments_joined = []
temp_comments_number = []
temp_proposal_id = []
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
    temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
    temp_comments_joined.append('\n'.join(temp_list))
    temp_comments_number.append(len(temp_list))
    temp_proposal_id.append(prop_id)

comments_df['prop_id'] = temp_proposal_id
comments_df['comments_joined'] = temp_comments_joined
comments_df['comments_number'] = temp_comments_number
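# In[ ]:


# EDITOR'S SKETCH, ILLUSTRATIVE ONLY (not part of the original pipeline): the per-proposal
# joining done by the loop above can also be expressed as a pandas groupby. Shown as an
# equivalent, more idiomatic alternative on a toy frame that reuses the real column names;
# the toy rows are made up.
import pandas as pd

toy_comments = pd.DataFrame({
    'commentable_id': [1, 1, 2],
    'body': ['first comment', 'second comment', 'only comment'],
})

toy_grouped = (toy_comments.groupby('commentable_id')['body']
               .agg(comments_joined='\n'.join, comments_number='count')
               .reset_index()
               .rename(columns={'commentable_id': 'prop_id'}))
# toy_grouped has the same columns as comments_df: prop_id, comments_joined, comments_number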
# In[16]:


# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))


# In[ ]:




# # Make comments lowercase

# In[17]:


comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())


# In[ ]:




# # Split sentences

# In[18]:


def split_sentences(txt):
    new_text_1 = sent_tokenize(txt,sent_token_lang)
    #outputs [] if txt is ''; or made of ' ' or '\n'

    new_text_2 = []
    if new_text_1 != []:
        for tok1 in new_text_1:
            new_text_2 += tok1.split('\n')
        #outputs [''] if txt is ''
        new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']

    if new_text_2 == []:
        new_text_2 = ['']

    return new_text_2


# In[19]:


comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)


# In[ ]:




# # Calculate sentence embeddings

# In[20]:


# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))

if stopwords_lang == 'spanish':
    for word in stop_words:
        stop_words = stop_words.union({re.sub(r"á","a",word)})
        stop_words = stop_words.union({re.sub(r"é","e",word)})
        stop_words = stop_words.union({re.sub(r"í","i",word)})
        stop_words = stop_words.union({re.sub(r"ó","o",word)})
        stop_words = stop_words.union({re.sub(r"ú","u",word)})

# additional tokens removed when they appear as independent tokens
if stopwords_lang == 'spanish':
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)


# In[21]:


def sentences_embeddings(sents):
    sent_embs = []

    for sent in sents:
        words = set(word_tokenize(sent))
        words = words-all_stop_words
        if len(words) != 0:
            # 300-dimensional vectors are assumed (glove.6B.300d / SBWC embeddings)
            emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
                len(words)+0.001)
        else:
            emb = np.zeros(300)
        sent_embs.append(emb)

    return sent_embs


# In[22]:


if tqdm_notebook:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].progress_apply(sentences_embeddings)
else:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].apply(sentences_embeddings)


# In[ ]:




# # Calculate sentence scores

# In[23]:


def sentences_scores(sents, sent_embs):

    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        return collections.OrderedDict([('',1.0)])

    nx_graph = nx.from_numpy_array(sim_mat)

    try:
        sentence_weight_temp = nx.pagerank(nx_graph)
    except Exception:
        # pagerank can fail to converge; fall back to zero weights instead of crashing
        sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)

    sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}

    sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1],
reverse=True) + sentence_scores = collections.OrderedDict(sorted_sentence_weights) + + return sentence_scores + + +# In[24]: + + +def plot_sentences_network(sents, sent_embs): + import matplotlib.pyplot as plt + + # similarity matrix + if len(sent_embs) > 1: + stacked_sent_embs = np.stack(sent_embs) + sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs) + np.fill_diagonal(sim_mat, 0) + elif len(sent_embs) == 1: + sim_mat = np.array([[0.]]) + else: + print('Nothing to plot') + return + + nx_graph = nx.from_numpy_array(sim_mat) + + plt.plot() + nx.draw(nx_graph, with_labels=True) + + +# In[25]: + + +comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply( + lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1) + + +# In[ ]: + + + + + +# # Generate the summaries + +# In[26]: + + +def comments_summary(sentence_weight, threshold_factor, *totalwords): + + threshold = threshold_factor * np.mean(list(sentence_weight.values())) + + sentence_counter = 0 + comments_summary = '' + + summary_num_words = 0 + + for sentence in sentence_weight: + if sentence_weight[sentence] >= (threshold): + if len(totalwords) == 0: + comments_summary += "\n- " + sentence + sentence_counter += 1 + elif summary_num_words < totalwords[0]: + comments_summary += "\n- " + sentence + sentence_counter += 1 + summary_num_words += len(sentence.split()) + + comments_summary = comments_summary.lstrip() + return comments_summary + + +# In[27]: + + +comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply( + lambda x: comments_summary(x,threshold_factor,max_size_of_summaries)) + + +# In[28]: + + +# comments_df + + +# In[29]: + + +# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows(): +# print(row['body']) +# print('-------') + + +# In[30]: + + +#print(comments_df.loc[8,'comments_summary']) + + +# In[ ]: + + + + + +# In[31]: + + +comments_df['commentable_type'] = ['Proposal']*len(comments_df) +comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']] +comments_summaries_df.reset_index(level=0, inplace=True) + + +# In[32]: + + +comments_summaries_df = comments_summaries_df.rename( + columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"}) + + +# In[33]: + + +#comments_summaries_df + + +# In[34]: + + +comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False) +comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False) + + +# In[ ]: + + + + + +# In[ ]: + + + + + +# In[35]: + + +logging.info('Script executed correctly.')
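# In[ ]:


# EDITOR'S APPENDIX, ILLUSTRATIVE ONLY (not part of the original script): the TextRank step
# used above in miniature. Random non-negative vectors stand in for the averaged GloVe
# sentence embeddings so the cell runs without the downloaded embedding files; the sentences
# and the 300-dimension size are made-up assumptions.
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

toy_sentences = ['the park needs more benches',
                 'please add more benches to the park',
                 'the bus line should also run at night']
rng = np.random.default_rng(0)
toy_embs = rng.random((len(toy_sentences), 300))  # stand-ins for sentence embeddings

toy_sim = cosine_similarity(toy_embs, toy_embs)
np.fill_diagonal(toy_sim, 0)
toy_scores = nx.pagerank(nx.from_numpy_array(toy_sim))

# highest-scoring sentences come first, mirroring the ordering comments_summary receives
toy_ranked = sorted(zip(toy_sentences, toy_scores.values()), key=lambda t: t[1], reverse=True)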