Update machine learning scripts with NNMF and TextRank-GloVe techniques
@@ -5,11 +5,63 @@
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Related Participatory Budgeting projects and Tags - Dummy script
|
Related Participatory Budgeting projects and Tags
|
||||||
|
|
||||||
|
This script generates for each project: a) Tags, b) List of related projects.
|
||||||
|
Running time: at most 2 hours for 10,000 projects.
|
||||||
|
Technique used: NNMF and Euclidean distance between projects.
|
||||||
|
More info in: https://github.com/consul-ml/consul-ml
|
||||||
"""
|
"""
|
||||||
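# In[ ]:


# A minimal, self-contained sketch of the technique described above: NMF topic
# modelling on TF-IDF vectors, with the Euclidean distance between rows of the
# document-topic matrix W used to find related items. The toy texts and
# parameters are illustrative and are not the configuration this script reads
# from its .ini file below.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

toy_texts = ['new bike lanes in the park', 'repair the park playground',
             'more bike parking downtown']
toy_tfidf = TfidfVectorizer().fit_transform(toy_texts)
toy_W = NMF(n_components=2, init='nndsvd', random_state=1).fit_transform(toy_tfidf)
toy_dists = np.linalg.norm(toy_W[0] - toy_W, axis=1)
print(toy_dists.argsort()[1])   # index of the project most related to project 0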
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def check_file(file_name):
|
||||||
|
if os.path.isfile(file_name):
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
logging.info('Missing file in Related Participatory Budgeting projects and Tags: ' + str(file_name))
|
||||||
|
except NameError:
|
||||||
|
print('No logging')
|
||||||
|
with open(os.path.join(data_path,taggings_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,tags_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,related_props_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Input file:
|
||||||
|
inputjsonfile = 'budget_investments.json'
|
||||||
|
col_id = 'id'
|
||||||
|
col_title = 'title'
|
||||||
|
cols_content = ['title','description']
|
||||||
|
|
||||||
|
# Output files:
|
||||||
|
topics_tags_filename = 'ml_topics_tags_budgets.json'
|
||||||
|
topics_tags_filename_csv = 'ml_topics_tags_budgets.csv'
|
||||||
|
|
||||||
|
repr_prop_filename = 'ml_repr_budgets.json'
|
||||||
|
repr_prop_filename_csv = 'ml_repr_budgets.csv'
|
||||||
|
|
||||||
|
taggings_filename = 'ml_taggings_budgets.json'
|
||||||
|
taggings_filename_csv = 'ml_taggings_budgets.csv'
|
||||||
|
|
||||||
|
tags_filename = 'ml_tags_budgets.json'
|
||||||
|
tags_filename_csv = 'ml_tags_budgets.csv'
|
||||||
|
|
||||||
|
related_props_filename = 'ml_related_content_budgets.json'
|
||||||
|
related_props_filename_csv = 'ml_related_content_budgets.csv'
|
||||||
|
|
||||||
|
tqdm_notebook = True
|
||||||
|
|
||||||
|
|
||||||
# In[2]:
|
# In[2]:
|
||||||
|
|
||||||
|
|
||||||
@@ -17,70 +69,691 @@ data_path = '../data'
|
|||||||
config_file = 'budgets_related_content_and_tags_nmf.ini'
|
config_file = 'budgets_related_content_and_tags_nmf.ini'
|
||||||
logging_file = 'budgets_related_content_and_tags_nmf.log'
|
logging_file = 'budgets_related_content_and_tags_nmf.log'
|
||||||
|
|
||||||
|
# Read the configuration file
|
||||||
|
import os
|
||||||
|
import configparser
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
check_file(os.path.join(data_path,config_file))
|
||||||
|
config.read(os.path.join(data_path,config_file))
|
||||||
|
|
||||||
|
stanza_model_lang = config['PREPROCESSING']['stanza_model_lang']
|
||||||
|
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
|
||||||
|
noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation')
|
||||||
|
n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count')
|
||||||
|
stanza_download = config['PREPROCESSING'].getboolean('stanza_download')
|
||||||
|
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
|
||||||
|
|
||||||
|
numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals')
|
||||||
|
|
||||||
|
numb_topics = config['TOPIC_MODELLING'].getint('numb_topics')
|
||||||
|
numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic')
|
||||||
|
n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props')
|
||||||
|
n_features = config['TOPIC_MODELLING'].getint('n_features')
|
||||||
|
min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val')
|
||||||
|
max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val')
|
||||||
|
|
||||||
|
logging_level = config['LOGGING']['logging_level']
|
||||||
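# In[ ]:


# For reference, a minimal configuration file with the sections and keys read
# above could look like the sketch below. The key names match the reads in
# this cell; the values are illustrative placeholders, not recommended defaults.
#
# [PREPROCESSING]
# stanza_model_lang = es
# stopwords_lang = spanish
# noun_lemmatisation = True
# n_gram_min_count = 5
# stanza_download = True
# nltk_download = True
#
# [RELATED_PROPOSALS]
# numb_related_proposals = 3
#
# [TOPIC_MODELLING]
# numb_topics = 20
# numb_topkeywords_pertopic = 10
# n_top_represent_props = 3
# n_features = 1000
# min_df_val = 0.01
# max_df_val = 0.95
#
# [LOGGING]
# logging_level = INFO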
|
|
||||||
|
|
||||||
# In[3]:
|
# In[3]:
|
||||||
|
|
||||||
|
|
||||||
# Input file:
|
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
|
||||||
inputjsonfile = 'budget_investments.json'
|
|
||||||
|
|
||||||
# Output files:
|
repr_prop_cols = ['topic_id','proposal_id','title']
|
||||||
taggings_filename = 'ml_taggings_budgets.json'
|
tags_file_cols = ['id','name','taggings_count','kind']
|
||||||
tags_filename = 'ml_tags_budgets.json'
|
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
|
||||||
related_props_filename = 'ml_related_content_budgets.json'
|
tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
|
||||||
|
|
||||||
|
tags_file_cols_count = 'taggings_count'
|
||||||
|
taggings_file_cols_id = 'tag_id'
|
||||||
|
|
||||||
|
|
||||||
# In[4]:
|
# In[4]:
|
||||||
|
|
||||||
|
|
||||||
import os
|
import logging
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
logging.basicConfig(filename=os.path.join(data_path,logging_file),
|
||||||
|
filemode='w',
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
level=logging_level)
|
||||||
|
#logging.info('message')
|
||||||
|
|
||||||
# ### Read the proposals
|
|
||||||
|
|
||||||
# In[5]:
|
# In[5]:
|
||||||
|
|
||||||
|
|
||||||
# proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
import os
|
||||||
# col_id = 'id'
|
import re
|
||||||
# cols_content = ['title','description']
|
import numpy as np
|
||||||
# proposals_input_df = proposals_input_df[[col_id]+cols_content]
|
import pandas as pd
|
||||||
|
from unicodedata import normalize
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
# ### Create file: Taggings. Each line is a Tag associated to a Proposal
|
|
||||||
|
|
||||||
# In[6]:
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
|
import stanza
|
||||||
taggings_file_df = pd.DataFrame(columns=taggings_file_cols)
|
if stanza_download:
|
||||||
row = [0,1,'Budget::Investment']
|
stanza.download(stanza_model_lang)
|
||||||
taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
|
|
||||||
taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
|
|
||||||
|
|
||||||
|
# If needed, add 'pos_batch_size': 10000 to the config options defined below.
|
||||||
|
config = {
|
||||||
|
'processors': 'tokenize,mwt,pos,lemma',
|
||||||
|
'lang': stanza_model_lang
|
||||||
|
}
|
||||||
|
#not using depparse
|
||||||
|
nlp = stanza.Pipeline(**config)
|
||||||
|
|
||||||
# ### Create file: Tags. List of Tags with the number of times they have been used
|
|
||||||
|
|
||||||
# In[7]:
|
# In[7]:
|
||||||
|
|
||||||
|
|
||||||
tags_file_cols = ['id','name','taggings_count','kind']
|
import tqdm
|
||||||
tags_file_df = pd.DataFrame(columns=tags_file_cols)
|
from tqdm.notebook import tqdm_notebook
|
||||||
row = [0,'tag',0,'']
|
tqdm_notebook.pandas()
|
||||||
tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)
|
# To use tqdm with pandas, call progress_apply instead of apply.
|
||||||
tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
|
|
||||||
|
|
||||||
|
|
||||||
# ### Create file: List of related proposals
|
|
||||||
|
|
||||||
# In[8]:
|
# In[8]:
|
||||||
|
|
||||||
|
|
||||||
numb_related_proposals = 2
|
import nltk
|
||||||
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
|
if nltk_download:
|
||||||
related_props_df = pd.DataFrame(columns=related_props_cols)
|
nltk.download('stopwords')
|
||||||
row = [1]+['' for num in range(1,numb_related_proposals+1)]
|
nltk.download('punkt')
|
||||||
related_props_df = related_props_df.append(dict(zip(related_props_cols,row)), ignore_index=True)
|
|
||||||
related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
from nltk.tokenize import word_tokenize, sent_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
|
import gensim
|
||||||
|
from gensim.models.phrases import Phrases, Phraser
|
||||||
|
|
||||||
|
|
||||||
|
# In[10]:
|
||||||
|
|
||||||
|
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.decomposition import NMF
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Read the proposals and join the content to use in the topic modelling
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
check_file(os.path.join(data_path,inputjsonfile))
|
||||||
|
proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
||||||
|
proposals_input_df = proposals_input_df[[col_id]+cols_content]
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
|
||||||
|
if len(proposals_input_df) == 0:
|
||||||
|
logging.info('No Proposals found.')
|
||||||
|
with open(os.path.join(data_path,taggings_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,tags_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,related_props_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[11]:
|
||||||
|
|
||||||
|
|
||||||
|
# Normalise characters
|
||||||
|
for col in cols_content:
|
||||||
|
proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x))
|
||||||
|
|
||||||
|
proposals_input_df['joined_content'] = proposals_input_df[cols_content].agg('\n'.join, axis=1)
|
||||||
|
proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title}))
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Lemmatise the content
|
||||||
|
|
||||||
|
# In[12]:
|
||||||
|
|
||||||
|
|
||||||
|
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content']
|
||||||
|
|
||||||
|
|
||||||
|
# In[13]:
|
||||||
|
|
||||||
|
|
||||||
|
# Using Stanza from Stanford NLP group
|
||||||
|
def content_processing_for_topicmodelling_1(txt):
|
||||||
|
|
||||||
|
# Delete html tags and urls
|
||||||
|
tmp_txt = re.sub("<[^<]+?>","",txt)
|
||||||
|
tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt)
|
||||||
|
tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt)
|
||||||
|
tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt)
|
||||||
|
tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt)
|
||||||
|
|
||||||
|
# Tokenise, lemmatise and select only the nouns
|
||||||
|
new_txt_tok = []
|
||||||
|
if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0:
|
||||||
|
tmp_txt_nlp = nlp(tmp_txt)
|
||||||
|
|
||||||
|
for sent in tmp_txt_nlp.sentences:
|
||||||
|
for token in sent.words:
|
||||||
|
if noun_lemmatisation:
|
||||||
|
if token.upos == 'NOUN':
|
||||||
|
new_txt_tok.append(token.lemma)
|
||||||
|
else:
|
||||||
|
new_txt_tok.append(token.text)
|
||||||
|
|
||||||
|
return new_txt_tok
|
||||||
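# In[ ]:


# Quick sanity check of the function above (assumes the Stanza model for
# stanza_model_lang has already been downloaded). With noun_lemmatisation
# enabled it should return only lemmatised nouns; the sentence and the
# expected output below are illustrative, and the exact tokens depend on the
# model.
# print(content_processing_for_topicmodelling_1('Queremos más carriles bici en el parque'))
# e.g. -> ['carril', 'bici', 'parque']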
|
|
||||||
|
|
||||||
|
# In[14]:
|
||||||
|
|
||||||
|
|
||||||
|
if tqdm_notebook:
|
||||||
|
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
|
||||||
|
'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1)
|
||||||
|
else:
|
||||||
|
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
|
||||||
|
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
# # Clean the data
|
||||||
|
|
||||||
|
# In[15]:
|
||||||
|
|
||||||
|
|
||||||
|
# Includes some extra steps for Spanish
|
||||||
|
# List of stop words to be removed
|
||||||
|
stop_words = set(stopwords.words(stopwords_lang))
|
||||||
|
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
for word in stop_words:
|
||||||
|
stop_words = stop_words.union({re.sub(r"á","a",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"é","e",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"í","i",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"ó","o",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"ú","u",word)})
|
||||||
|
|
||||||
|
# additional tokens removed when they appear as standalone characters or punctuation
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
|
||||||
|
else:
|
||||||
|
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
|
||||||
|
all_stop_words = stop_words.union(additional_stop_words)
|
||||||
|
|
||||||
|
|
||||||
|
# In[16]:
|
||||||
|
|
||||||
|
|
||||||
|
def content_processing_for_topicmodelling_2(txt_tok):
|
||||||
|
new_text_tok = []
|
||||||
|
for word in txt_tok:
|
||||||
|
new_word = word.lower()
|
||||||
|
new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word)
|
||||||
|
new_word = re.sub(r"[0-9]+","",new_word)
|
||||||
|
new_word = new_word.rstrip("\n")
|
||||||
|
if (len(new_word) != 0) and (new_word not in all_stop_words):
|
||||||
|
new_text_tok.append(new_word)
|
||||||
|
|
||||||
|
return new_text_tok
|
||||||
|
|
||||||
|
|
||||||
|
# In[17]:
|
||||||
|
|
||||||
|
|
||||||
|
proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
|
||||||
|
'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Detect n-grams
|
||||||
|
|
||||||
|
# In[18]:
|
||||||
|
|
||||||
|
|
||||||
|
txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist()
|
||||||
|
|
||||||
|
phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count)
|
||||||
|
txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram]
|
||||||
|
txt_bigram_joined = [' '.join(txt) for txt in txt_bigram]
|
||||||
|
|
||||||
|
# may also contain four-grams when two bigrams are joined:
|
||||||
|
# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count)
|
||||||
|
# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram]
|
||||||
|
# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram]
|
||||||
|
|
||||||
|
proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined
|
||||||
|
# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined
|
||||||
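# In[ ]:


# Illustrative sketch (toy sentences, not the project corpus) of how gensim
# Phrases merges frequently co-occurring tokens into single bigram tokens.
# min_count and threshold are chosen so this tiny example triggers a merge.
from gensim.models.phrases import Phrases
toy_sents = [['carril', 'bici'], ['carril', 'bici', 'parque'], ['parque', 'infantil']]
toy_bigram_model = Phrases(toy_sents, min_count=1, threshold=0.1)
print(toy_bigram_model[['carril', 'bici', 'parque']])   # -> ['carril_bici', 'parque']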
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Topic modelling (NMF)
|
||||||
|
|
||||||
|
# In[19]:
|
||||||
|
|
||||||
|
|
||||||
|
df_col_to_use = proposals_input_df['joined_content_topicmodelling']
|
||||||
|
|
||||||
|
# NUMBER OF TOPICS
|
||||||
|
n_components = numb_topics
|
||||||
|
# SELECT the TOP n_top_words WORDS for each topic
|
||||||
|
n_top_words = numb_topkeywords_pertopic
|
||||||
|
|
||||||
|
# Use tf-idf features for NMF
|
||||||
|
tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val,
|
||||||
|
max_features=n_features)
|
||||||
|
|
||||||
|
tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist())
|
||||||
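# In[ ]:


# Toy illustration (made-up documents) of the vocabulary pruning performed
# above: max_df drops terms appearing in more than that fraction of documents,
# min_df drops terms appearing in fewer documents than the threshold, and
# max_features keeps only the most frequent remaining terms.
from sklearn.feature_extraction.text import TfidfVectorizer
demo_vectorizer = TfidfVectorizer(max_df=0.67, min_df=1).fit(
    ['park bike lane', 'park bench', 'park bike parking'])
print(sorted(demo_vectorizer.vocabulary_))   # 'park' is dropped: it appears in 3/3 > 0.67 of the documents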
|
|
||||||
|
|
||||||
|
# In[20]:
|
||||||
|
|
||||||
|
|
||||||
|
# Includes some extra steps for Spanish
|
||||||
|
def cleaning_features(top_features):
|
||||||
|
clean_features = top_features.copy()
|
||||||
|
for feature in clean_features:
|
||||||
|
if feature+'s' in clean_features: clean_features[max(
|
||||||
|
clean_features.index(feature),clean_features.index(feature+'s'))] = ''
|
||||||
|
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
for feature in clean_features:
|
||||||
|
if feature+'es' in clean_features: clean_features[max(
|
||||||
|
clean_features.index(feature),clean_features.index(feature+'es'))] = ''
|
||||||
|
for feature in clean_features:
|
||||||
|
if feature+'r' in clean_features: clean_features[max(
|
||||||
|
clean_features.index(feature),clean_features.index(feature+'r'))] = ''
|
||||||
|
|
||||||
|
nosign_features = clean_features.copy()
|
||||||
|
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
nosign_features[pos]=re.sub(r"á","a",fet)
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
nosign_features[pos]=re.sub(r"é","e",fet)
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
nosign_features[pos]=re.sub(r"í","i",fet)
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
nosign_features[pos]=re.sub(r"ó","o",fet)
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
nosign_features[pos]=re.sub(r"ú","u",fet)
|
||||||
|
|
||||||
|
for pos,fet in enumerate(nosign_features):
|
||||||
|
if fet in nosign_features[pos+1:]:
|
||||||
|
clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = ''
|
||||||
|
|
||||||
|
return clean_features
|
||||||
|
|
||||||
|
|
||||||
|
# Fit the NMF model
|
||||||
|
nmf = NMF(n_components=n_components, random_state=1,
|
||||||
|
alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
|
||||||
|
|
||||||
|
# nmf.components_ is the H matrix
|
||||||
|
# W = nmf.fit_transform(tfidf)
|
||||||
|
|
||||||
|
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
|
||||||
|
|
||||||
|
# Size of the vocabulary and the nmf matrix
|
||||||
|
#print(len(tfidf_vectorizer.vocabulary_))
|
||||||
|
#print(len(tfidf_feature_names))
|
||||||
|
#nmf.components_.shape
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ### Create file: Repr_Prop. Most representative proposal for each topic
|
||||||
|
|
||||||
|
# In[21]:
|
||||||
|
|
||||||
|
|
||||||
|
W = nmf.fit_transform(tfidf)
|
||||||
|
#print(W.shape)
|
||||||
|
|
||||||
|
repr_prop_df = pd.DataFrame(columns=repr_prop_cols)
|
||||||
|
|
||||||
|
for topic_index in range(n_components):
|
||||||
|
top_indices = np.argsort( W[:,topic_index] )[::-1]
|
||||||
|
top_represent_proposals = []
|
||||||
|
for proposal_index in top_indices[0:n_top_represent_props]:
|
||||||
|
top_represent_proposals.append(proposal_index)
|
||||||
|
|
||||||
|
for prop_internal_index in top_represent_proposals:
|
||||||
|
row = [topic_index,
|
||||||
|
proposals_input_df.loc[int(prop_internal_index),'id'],
|
||||||
|
proposals_input_df.loc[int(prop_internal_index),'title']]
|
||||||
|
repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True)
|
||||||
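# In[ ]:


# Toy illustration of the selection above: for one topic column of W, argsort
# in descending order gives the row indices of the proposals that load most
# strongly on that topic (the values below are made up).
import numpy as np
toy_topic_column = np.array([0.1, 0.7, 0.0, 0.4])
print(np.argsort(toy_topic_column)[::-1][:2])   # -> [1 3], the two most representative rows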
|
|
||||||
|
|
||||||
|
# In[22]:
|
||||||
|
|
||||||
|
|
||||||
|
repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False)
|
||||||
|
repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ### Create file: Topics_Tags. List of Topics with their top Tags
|
||||||
|
|
||||||
|
# In[23]:
|
||||||
|
|
||||||
|
|
||||||
|
topics_tags_df = pd.DataFrame(columns=['id']+tag_cols)
|
||||||
|
|
||||||
|
# nmf.components_ is the H matrix
|
||||||
|
|
||||||
|
for topic_idx, topic in enumerate(nmf.components_):
|
||||||
|
obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
|
||||||
|
clean_obj_temp = cleaning_features(obj_temp)
|
||||||
|
clean_obj_temp.insert(0, str(topic_idx))
|
||||||
|
#print(clean_obj_temp)
|
||||||
|
topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[24]:
|
||||||
|
|
||||||
|
|
||||||
|
topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False)
|
||||||
|
topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ### Create file: Taggings. Each line is a Tag associated to a Proposal
|
||||||
|
|
||||||
|
# In[25]:
|
||||||
|
|
||||||
|
|
||||||
|
# Coefficients for following calculation
|
||||||
|
tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
|
||||||
|
topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols)
|
||||||
|
|
||||||
|
# nmf.components_ is the H matrix
|
||||||
|
|
||||||
|
for topic_idx, topic in enumerate(nmf.components_):
|
||||||
|
topics_tags_coefs_temp = []
|
||||||
|
topics_tags_coefs_temp.append(int(topic_idx))
|
||||||
|
for i in topic.argsort()[:-n_top_words - 1:-1]:
|
||||||
|
topics_tags_coefs_temp.append(topic[i])
|
||||||
|
topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols,
|
||||||
|
topics_tags_coefs_temp)), ignore_index=True)
|
||||||
|
|
||||||
|
for col in tag_cols:
|
||||||
|
for topic_idx,topic in enumerate(topics_tags_df[col].tolist()):
|
||||||
|
if topic == '':
|
||||||
|
topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
# In[26]:
|
||||||
|
|
||||||
|
|
||||||
|
topics_tags_flat = []
|
||||||
|
for idx,topic in topics_tags_df.iterrows():
|
||||||
|
topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist()
|
||||||
|
|
||||||
|
topics_tags_coefs_flat = []
|
||||||
|
for idx,topic in topics_tags_coefs_df.iterrows():
|
||||||
|
topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
# In[27]:
|
||||||
|
|
||||||
|
|
||||||
|
taggings_file_df = pd.DataFrame(columns=taggings_file_cols)
|
||||||
|
|
||||||
|
for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)):
|
||||||
|
proposal_topics_temp = np.zeros((len(topics_tags_flat)))
|
||||||
|
cont = 0
|
||||||
|
for weight in prop:
|
||||||
|
for n in range(n_top_words):
|
||||||
|
proposal_topics_temp[cont] = weight
|
||||||
|
cont += 1
|
||||||
|
|
||||||
|
proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat
|
||||||
|
|
||||||
|
# Adding the coefficients of same tags:
|
||||||
|
for numterm_a,term_a in enumerate(topics_tags_flat):
|
||||||
|
for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))):
|
||||||
|
term_b = topics_tags_flat[numterm_b]
|
||||||
|
if (term_a == term_b):
|
||||||
|
proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b]
|
||||||
|
proposal_tags_temp[numterm_b] = 0
|
||||||
|
|
||||||
|
for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]:
|
||||||
|
row = [i,proposals_input_df.loc[prop_idx,'id'],'Budget::Investment']
|
||||||
|
taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ### Create file: Tags. List of Tags with the number of times they have been used
|
||||||
|
|
||||||
|
# In[28]:
|
||||||
|
|
||||||
|
|
||||||
|
tags_file_df = pd.DataFrame(columns=tags_file_cols)
|
||||||
|
|
||||||
|
for tag_id,tag in enumerate(topics_tags_flat):
|
||||||
|
row = [tag_id,tag,0,'']
|
||||||
|
tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)
|
||||||
|
|
||||||
|
for tag_id in taggings_file_df[taggings_file_cols_id].tolist():
|
||||||
|
tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1
|
||||||
|
|
||||||
|
|
||||||
|
# ### Deleting duplicate tags from files Tag and Taggings before saving them
|
||||||
|
|
||||||
|
# In[29]:
|
||||||
|
|
||||||
|
|
||||||
|
change_rows = []
|
||||||
|
repeated_ids = []
|
||||||
|
for idx1,row1 in tags_file_df.iterrows():
|
||||||
|
for idx2,row2 in tags_file_df.iterrows():
|
||||||
|
if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']):
|
||||||
|
change_rows.append((idx1,idx2))
|
||||||
|
repeated_ids.append(idx2)
|
||||||
|
|
||||||
|
tags_file_df = tags_file_df.drop(repeated_ids)
|
||||||
|
|
||||||
|
|
||||||
|
# In[30]:
|
||||||
|
|
||||||
|
|
||||||
|
for c_row in change_rows:
|
||||||
|
taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x)
|
||||||
|
|
||||||
|
|
||||||
|
# In[31]:
|
||||||
|
|
||||||
|
|
||||||
|
tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
|
||||||
|
tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[32]:
|
||||||
|
|
||||||
|
|
||||||
|
taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
|
||||||
|
taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[33]:
|
||||||
|
|
||||||
|
|
||||||
|
# proposals_input_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[34]:
|
||||||
|
|
||||||
|
|
||||||
|
# repr_prop_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[35]:
|
||||||
|
|
||||||
|
|
||||||
|
# topics_tags_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[36]:
|
||||||
|
|
||||||
|
|
||||||
|
# taggings_file_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[37]:
|
||||||
|
|
||||||
|
|
||||||
|
# tags_file_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # LIST OF RELATED PROPOSALS
|
||||||
|
|
||||||
|
# In[38]:
|
||||||
|
|
||||||
|
|
||||||
|
proposal_topics_coefs_cols = ['id','topic_coefs']
|
||||||
|
proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols)
|
||||||
|
|
||||||
|
for prop_idx,prop in enumerate(W):
|
||||||
|
row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()]
|
||||||
|
proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)),
|
||||||
|
ignore_index=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[39]:
|
||||||
|
|
||||||
|
|
||||||
|
related_props_df = pd.DataFrame(columns=related_props_cols)
|
||||||
|
|
||||||
|
for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)):
|
||||||
|
prop_related_temp = []
|
||||||
|
prop_related_temp.append(int(row['id']))
|
||||||
|
vectora = row['topic_coefs']
|
||||||
|
distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()]
|
||||||
|
|
||||||
|
# the closest matches also include the initial proposal itself, hence numb_related_proposals+1
|
||||||
|
for i in np.array(distances).argsort()[0:numb_related_proposals+1]:
|
||||||
|
if distances[i] != 0.0:
|
||||||
|
prop_related_temp.append(int(proposals_input_df.loc[i,'id']))
|
||||||
|
|
||||||
|
# in case there are fewer related proposals than the maximum number
|
||||||
|
while len(prop_related_temp) < numb_related_proposals+1:
|
||||||
|
prop_related_temp.append('')
|
||||||
|
|
||||||
|
related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True)
|
||||||
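# In[ ]:


# Note on the loop above: it recomputes the distance from each proposal to all
# others one row at a time. An equivalent vectorised formulation (a sketch of
# an alternative, not what this script does) would compute the full pairwise
# Euclidean distance matrix in a single call:
# from scipy.spatial.distance import cdist
# topic_matrix = np.stack(proposal_topics_coefs_df['topic_coefs'].tolist())
# all_distances = cdist(topic_matrix, topic_matrix, 'euclidean')
# # all_distances[i].argsort()[1:numb_related_proposals+1] -> closest proposals to proposal i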
|
|
||||||
|
|
||||||
|
# In[40]:
|
||||||
|
|
||||||
|
|
||||||
|
related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
|
||||||
|
related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[41]:
|
||||||
|
|
||||||
|
|
||||||
|
#proposal_topics_coefs_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[42]:
|
||||||
|
|
||||||
|
|
||||||
|
#related_props_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[43]:
|
||||||
|
|
||||||
|
|
||||||
|
logging.info('Script executed correctly.')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,59 +1,549 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
# In[1]:
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Participatory Budgeting comments summaries - Dummy script
|
Participatory Budgeting comments summaries
|
||||||
|
|
||||||
|
This script generates for each budget project a summary of all its comments.
|
||||||
|
Running time: at most 1 hour for 10,000 proposals.
|
||||||
|
Technique used: GloVe embeddings and TextRank.
|
||||||
|
More info in: https://github.com/consul-ml/consul-ml
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# In[2]:
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# DOWNLOAD THE GLOVE EMBEDDINGS INTO THE DATA FOLDER:
|
||||||
|
|
||||||
|
# ENGLISH:
|
||||||
|
#!wget https://nlp.stanford.edu/data/glove.6B.zip
|
||||||
|
#!unzip glove.6B.zip
|
||||||
|
|
||||||
|
# SPANISH:
|
||||||
|
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
|
||||||
|
#!gunzip glove-sbwc*.gz
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def check_file(file_name):
|
||||||
|
if os.path.isfile(file_name):
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
logging.info('Missing file in Participatory Budgeting comments summaries: ' + str(file_name))
|
||||||
|
except NameError:
|
||||||
|
print('No logging')
|
||||||
|
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Input file:
|
||||||
|
inputjsonfile = 'comments.json'
|
||||||
|
col_id = 'commentable_id'
|
||||||
|
col_content = 'body'
|
||||||
|
|
||||||
|
# Output files:
|
||||||
|
comments_summaries_filename = 'ml_comments_summaries_budgets.json'
|
||||||
|
comments_summaries_filename_csv = 'ml_comments_summaries_budgets.csv'
|
||||||
|
|
||||||
|
tqdm_notebook = True
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
data_path = '../data'
|
data_path = '../data'
|
||||||
config_file = 'budgets_summary_comments_textrank.ini'
|
config_file = 'budgets_summary_comments_textrank.ini'
|
||||||
logging_file = 'budgets_summary_comments_textrank.log'
|
logging_file = 'budgets_summary_comments_textrank.log'
|
||||||
|
|
||||||
|
# Read the configuration file
|
||||||
|
import os
|
||||||
|
import configparser
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
check_file(os.path.join(data_path,config_file))
|
||||||
|
config.read(os.path.join(data_path,config_file))
|
||||||
|
|
||||||
# In[3]:
|
sent_token_lang = config['PREPROCESSING']['sent_token_lang']
|
||||||
|
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
|
||||||
|
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
|
||||||
|
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
glove_file = config['SUMMARISATION']['glove_file_es']
|
||||||
|
if stopwords_lang == 'english':
|
||||||
|
glove_file = config['SUMMARISATION']['glove_file_en']
|
||||||
|
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
|
||||||
|
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')
|
||||||
|
|
||||||
|
logging_level = config['LOGGING']['logging_level']
|
||||||
|
|
||||||
|
|
||||||
# Input file:
|
# In[ ]:
|
||||||
inputjsonfile = 'comments.json'
|
|
||||||
|
|
||||||
# Output files:
|
|
||||||
comments_summaries_filename = 'ml_comments_summaries_budgets.json'
|
|
||||||
|
|
||||||
|
|
||||||
# In[4]:
|
import logging
|
||||||
|
|
||||||
|
logging.basicConfig(filename=os.path.join(data_path,logging_file),
|
||||||
|
filemode='w',
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
level=logging_level)
|
||||||
|
#logging.info('message')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import re
|
||||||
|
from unicodedata import normalize
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
# ### Read the comments
|
# In[ ]:
|
||||||
|
|
||||||
# In[5]:
|
|
||||||
|
|
||||||
|
|
||||||
# comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
import nltk
|
||||||
# col_id = 'commentable_id'
|
if nltk_download:
|
||||||
# col_content = 'body'
|
nltk.download('stopwords')
|
||||||
# comments_input_df = comments_input_df[[col_id]+[col_content]]
|
nltk.download('punkt')
|
||||||
|
|
||||||
|
|
||||||
# ### Create file. Comments summaries
|
# In[ ]:
|
||||||
|
|
||||||
# In[6]:
|
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
from nltk.tokenize import word_tokenize, sent_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
from gensim.scripts.glove2word2vec import glove2word2vec
|
||||||
|
from gensim.models import KeyedVectors
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import networkx as nx
|
||||||
|
import collections
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
import tqdm
|
||||||
|
from tqdm.notebook import tqdm_notebook
|
||||||
|
tqdm_notebook.pandas()
|
||||||
|
# To use tqdm with pandas, call progress_apply instead of apply.
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Different code for Spanish and English vectors
|
||||||
|
# Extract word vectors
|
||||||
|
|
||||||
|
check_file(os.path.join(data_path,glove_file))
|
||||||
|
|
||||||
|
if stopwords_lang == 'english':
|
||||||
|
non_keyed_embs = os.path.join(data_path,glove_file)
|
||||||
|
keyed_embs = os.path.join(data_path,glove_file+'.vec')
|
||||||
|
if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
|
||||||
|
glove2word2vec(non_keyed_embs, keyed_embs)
|
||||||
|
glove_file = glove_file+'.vec'
|
||||||
|
|
||||||
|
word_embeddings = {}
|
||||||
|
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
|
||||||
|
for line in f:
|
||||||
|
values = line.split()
|
||||||
|
word = values[0]
|
||||||
|
coefs = np.asarray(values[1:], dtype='float32')
|
||||||
|
word_embeddings[word] = coefs
|
||||||
|
f.close()
|
||||||
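# In[ ]:


# Optional sanity check of the embeddings loaded above: the code further down
# assumes 300-dimensional word vectors (np.zeros(300) is used as the
# fallback), so the configured GloVe file should be a 300d variant.
# print(len(word_embeddings), len(next(iter(word_embeddings.values()))))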
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Read the comments and join the comments belonging to the same proposal
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
check_file(os.path.join(data_path,inputjsonfile))
|
||||||
|
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
||||||
|
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Budget::Investment']
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# TERMINATE THE SCRIPT IF THERE ARE NO COMMENTS
|
||||||
|
if len(comments_input_df) == 0:
|
||||||
|
logging.info('No Participatory Budgeting comments found to summarise.')
|
||||||
|
with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_input_df = comments_input_df[[col_id]+[col_content]]
|
||||||
|
|
||||||
|
# Normalise characters
|
||||||
|
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))
|
||||||
|
|
||||||
|
comments_input_df = comments_input_df.sort_values(by=col_id)
|
||||||
|
comments_input_df.reset_index(drop=True,inplace=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Drop empty texts
|
||||||
|
|
||||||
|
empty_txt_ids = []
|
||||||
|
for idx,row in comments_input_df.iterrows():
|
||||||
|
if row['body'].strip() == '':
|
||||||
|
empty_txt_ids.append(idx)
|
||||||
|
|
||||||
|
comments_input_df = comments_input_df.drop(empty_txt_ids)
|
||||||
|
comments_input_df.reset_index(drop=True,inplace=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df = pd.DataFrame()
|
||||||
|
|
||||||
|
temp_comments_joined = []
|
||||||
|
temp_comments_number = []
|
||||||
|
temp_proposal_id = []
|
||||||
|
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
|
||||||
|
temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
|
||||||
|
temp_comments_joined.append('\n'.join(temp_list))
|
||||||
|
temp_comments_number.append(len(temp_list))
|
||||||
|
temp_proposal_id.append(prop_id)
|
||||||
|
|
||||||
|
comments_df['prop_id'] = temp_proposal_id
|
||||||
|
comments_df['comments_joined'] = temp_comments_joined
|
||||||
|
comments_df['comments_number'] = temp_comments_number
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# # Stats
|
||||||
|
# print(len(comments_df))
|
||||||
|
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
|
||||||
|
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
|
||||||
|
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Make comments lowercase
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Split sentences
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def split_sentences(txt):
|
||||||
|
new_text_1 = sent_tokenize(txt,sent_token_lang)
|
||||||
|
# outputs [] if txt is '' or consists only of ' ' and '\n'
|
||||||
|
|
||||||
|
new_text_2 = []
|
||||||
|
if new_text_1 != []:
|
||||||
|
for tok1 in new_text_1:
|
||||||
|
new_text_2 += tok1.split('\n')
|
||||||
|
# split('\n') may produce empty strings; they are filtered out below
|
||||||
|
new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']
|
||||||
|
|
||||||
|
if new_text_2 == []:
|
||||||
|
new_text_2 = ['']
|
||||||
|
|
||||||
|
return new_text_2
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Calculate sentence embeddings
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Includes some extra steps for Spanish
|
||||||
|
# List of stop words to be removed
|
||||||
|
stop_words = set(stopwords.words(stopwords_lang))
|
||||||
|
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
for word in stop_words:
|
||||||
|
stop_words = stop_words.union({re.sub(r"á","a",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"é","e",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"í","i",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"ó","o",word)})
|
||||||
|
stop_words = stop_words.union({re.sub(r"ú","u",word)})
|
||||||
|
|
||||||
|
# additional tokens removed when they appear as standalone characters or punctuation
|
||||||
|
if stopwords_lang == 'spanish':
|
||||||
|
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
|
||||||
|
else:
|
||||||
|
additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
|
||||||
|
all_stop_words = stop_words.union(additional_stop_words)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def sentences_embeddings(sents):
|
||||||
|
sent_embs = []
|
||||||
|
|
||||||
|
for sent in sents:
|
||||||
|
words = set(word_tokenize(sent))
|
||||||
|
words = words-all_stop_words
|
||||||
|
if len(words) != 0:
|
||||||
|
emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
|
||||||
|
len(words)+0.001)
|
||||||
|
else:
|
||||||
|
emb = np.zeros(300)
|
||||||
|
sent_embs.append(emb)
|
||||||
|
|
||||||
|
return sent_embs
|
||||||
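# In[ ]:


# What sentences_embeddings() computes, shown on a toy two-dimensional
# embedding table (illustrative only; the real table is the word_embeddings
# dict loaded above): each sentence becomes the average of its word vectors
# after stop words are removed.
import numpy as np
fake_embs = {'bike': np.array([1.0, 0.0]), 'lane': np.array([0.0, 1.0])}
toy_sentence = ['bike', 'lane']
print(sum(fake_embs[w] for w in toy_sentence) / len(toy_sentence))   # -> [0.5 0.5]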
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
if tqdm_notebook:
|
||||||
|
comments_df['comments_sentences_embeddings'] = comments_df[
|
||||||
|
'comments_sentences'].progress_apply(sentences_embeddings)
|
||||||
|
else:
|
||||||
|
comments_df['comments_sentences_embeddings'] = comments_df[
|
||||||
|
'comments_sentences'].apply(sentences_embeddings)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Calculate sentence scores
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def sentences_scores(sents, sent_embs):
|
||||||
|
|
||||||
|
# similarity matrix
|
||||||
|
if len(sent_embs) > 1:
|
||||||
|
stacked_sent_embs = np.stack(sent_embs)
|
||||||
|
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
|
||||||
|
np.fill_diagonal(sim_mat, 0)
|
||||||
|
elif len(sent_embs) == 1:
|
||||||
|
sim_mat = np.array([[0.]])
|
||||||
|
else:
|
||||||
|
return collections.OrderedDict([('',1.0)])
|
||||||
|
|
||||||
|
nx_graph = nx.from_numpy_array(sim_mat)
|
||||||
|
|
||||||
|
try:
|
||||||
|
sentence_weight_temp = nx.pagerank(nx_graph)
|
||||||
|
except:
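# nx.pagerank may fail to converge on some similarity graphs; fall back to zero weights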
|
||||||
|
sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)
|
||||||
|
|
||||||
|
sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}
|
||||||
|
|
||||||
|
sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
|
||||||
|
sentence_scores = collections.OrderedDict(sorted_sentence_weights)
|
||||||
|
|
||||||
|
return sentence_scores
|
||||||
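# In[ ]:


# Minimal sketch of the TextRank step inside sentences_scores(): build a graph
# whose edge weights are sentence-to-sentence cosine similarities and let
# PageRank score the sentences. The similarity matrix below is made up.
import numpy as np
import networkx as nx
toy_sim = np.array([[0.0, 0.9, 0.1],
                    [0.9, 0.0, 0.2],
                    [0.1, 0.2, 0.0]])
toy_scores = nx.pagerank(nx.from_numpy_array(toy_sim))
print(toy_scores)   # sentences 0 and 1 reinforce each other and score highest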
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def plot_sentences_network(sents, sent_embs):
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# similarity matrix
|
||||||
|
if len(sent_embs) > 1:
|
||||||
|
stacked_sent_embs = np.stack(sent_embs)
|
||||||
|
sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
|
||||||
|
np.fill_diagonal(sim_mat, 0)
|
||||||
|
elif len(sent_embs) == 1:
|
||||||
|
sim_mat = np.array([[0.]])
|
||||||
|
else:
|
||||||
|
print('Nothing to plot')
|
||||||
|
return
|
||||||
|
|
||||||
|
nx_graph = nx.from_numpy_array(sim_mat)
|
||||||
|
|
||||||
|
plt.plot()
|
||||||
|
nx.draw(nx_graph, with_labels=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
|
||||||
|
lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Generate the summaries
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def comments_summary(sentence_weight, threshold_factor, *totalwords):
|
||||||
|
|
||||||
|
threshold = threshold_factor * np.mean(list(sentence_weight.values()))
|
||||||
|
|
||||||
|
sentence_counter = 0
|
||||||
|
comments_summary = ''
|
||||||
|
|
||||||
|
summary_num_words = 0
|
||||||
|
|
||||||
|
for sentence in sentence_weight:
|
||||||
|
if sentence_weight[sentence] >= (threshold):
|
||||||
|
if len(totalwords) == 0:
|
||||||
|
comments_summary += "\n- " + sentence
|
||||||
|
sentence_counter += 1
|
||||||
|
elif summary_num_words < totalwords[0]:
|
||||||
|
comments_summary += "\n- " + sentence
|
||||||
|
sentence_counter += 1
|
||||||
|
summary_num_words += len(sentence.split())
|
||||||
|
|
||||||
|
comments_summary = comments_summary.lstrip()
|
||||||
|
return comments_summary
|
||||||
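# In[ ]:


# Usage sketch for comments_summary() above (toy weights and an illustrative
# threshold_factor): only sentences scoring at least threshold_factor times
# the mean weight are kept, and the optional third argument caps the summary
# length in words.
demo_weights = {'great idea for the park': 0.5, 'i agree': 0.2, 'thanks': 0.1}
print(comments_summary(demo_weights, 1.0, 50))   # -> '- great idea for the park'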
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
|
||||||
|
lambda x: comments_summary(x,threshold_factor,max_size_of_summaries))
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# comments_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
|
||||||
|
# print(row['body'])
|
||||||
|
# print('-------')
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
#print(comments_df.loc[8,'comments_summary'])
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_df['commentable_type'] = ['Budget::Investment']*len(comments_df)
|
||||||
|
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
|
||||||
|
comments_summaries_df.reset_index(level=0, inplace=True)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
comments_summaries_df = comments_summaries_df.rename(
|
||||||
|
columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
#comments_summaries_df
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
comments_summaries_cols = ['id','commentable_id','commentable_type','body']
|
|
||||||
comments_summaries_df = pd.DataFrame(columns=comments_summaries_cols)
|
|
||||||
row = [0,0,'Budget::Investment','Summary']
|
|
||||||
comments_summaries_df = comments_summaries_df.append(dict(zip(comments_summaries_cols,row)), ignore_index=True)
|
|
||||||
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
|
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
|
||||||
|
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
logging.info('Script executed correctly.')
|
||||||
|
|
||||||
|
|||||||
@@ -5,11 +5,63 @@
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Related Proposals and Tags - Dummy script
|
Related Proposals and Tags
|
||||||
|
|
||||||
|
This script generates for each proposal: a) Tags, b) List of related proposals.
|
||||||
|
Running time: at most 2 hours for 10,000 proposals.
|
||||||
|
Technique used: NNMF and Euclidean distance between proposals.
|
||||||
|
More info in: https://github.com/consul-ml/consul-ml
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def check_file(file_name):
|
||||||
|
if os.path.isfile(file_name):
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
logging.info('Missing file in Related Proposals and Tags: ' + str(file_name))
|
||||||
|
except NameError:
|
||||||
|
print('No logging')
|
||||||
|
with open(os.path.join(data_path,taggings_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,tags_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,related_props_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# Input file:
|
||||||
|
inputjsonfile = 'proposals.json'
|
||||||
|
col_id = 'id'
|
||||||
|
col_title = 'title'
|
||||||
|
cols_content = ['title','description','summary']
|
||||||
|
|
||||||
|
# Output files:
|
||||||
|
topics_tags_filename = 'ml_topics_tags_proposals.json'
|
||||||
|
topics_tags_filename_csv = 'ml_topics_tags_proposals.csv'
|
||||||
|
|
||||||
|
repr_prop_filename = 'ml_repr_proposals.json'
|
||||||
|
repr_prop_filename_csv = 'ml_repr_proposals.csv'
|
||||||
|
|
||||||
|
taggings_filename = 'ml_taggings_proposals.json'
|
||||||
|
taggings_filename_csv = 'ml_taggings_proposals.csv'
|
||||||
|
|
||||||
|
tags_filename = 'ml_tags_proposals.json'
|
||||||
|
tags_filename_csv = 'ml_tags_proposals.csv'
|
||||||
|
|
||||||
|
related_props_filename = 'ml_related_content_proposals.json'
|
||||||
|
related_props_filename_csv = 'ml_related_content_proposals.csv'
|
||||||
|
|
||||||
|
tqdm_notebook = True
|
||||||
|
|
||||||
|
|
||||||
# In[2]:
|
# In[2]:
|
||||||
|
|
||||||
|
|
||||||
@@ -17,70 +69,691 @@ data_path = '../data'
|
|||||||
config_file = 'proposals_related_content_and_tags_nmf.ini'
|
config_file = 'proposals_related_content_and_tags_nmf.ini'
|
||||||
logging_file = 'proposals_related_content_and_tags_nmf.log'
|
logging_file = 'proposals_related_content_and_tags_nmf.log'
|
||||||
|
|
||||||
|
# Read the configuration file
|
||||||
|
import os
|
||||||
|
import configparser
|
||||||
|
config = configparser.ConfigParser()
|
||||||
|
check_file(os.path.join(data_path,config_file))
|
||||||
|
config.read(os.path.join(data_path,config_file))
|
||||||
|
|
||||||
|
stanza_model_lang = config['PREPROCESSING']['stanza_model_lang']
|
||||||
|
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
|
||||||
|
noun_lemmatisation = config['PREPROCESSING'].getboolean('noun_lemmatisation')
|
||||||
|
n_gram_min_count = config['PREPROCESSING'].getint('n_gram_min_count')
|
||||||
|
stanza_download = config['PREPROCESSING'].getboolean('stanza_download')
|
||||||
|
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
|
||||||
|
|
||||||
|
numb_related_proposals = config['RELATED_PROPOSALS'].getint('numb_related_proposals')
|
||||||
|
|
||||||
|
numb_topics = config['TOPIC_MODELLING'].getint('numb_topics')
|
||||||
|
numb_topkeywords_pertopic = config['TOPIC_MODELLING'].getint('numb_topkeywords_pertopic')
|
||||||
|
n_top_represent_props = config['TOPIC_MODELLING'].getint('n_top_represent_props')
|
||||||
|
n_features = config['TOPIC_MODELLING'].getint('n_features')
|
||||||
|
min_df_val = config['TOPIC_MODELLING'].getfloat('min_df_val')
|
||||||
|
max_df_val = config['TOPIC_MODELLING'].getfloat('max_df_val')
|
||||||
|
|
||||||
|
logging_level = config['LOGGING']['logging_level']
|
||||||
|
|
||||||
|
|
||||||
# In[3]:
|
# In[3]:
|
||||||
|
|
||||||
|
|
||||||
# Input file:
|
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
|
||||||
inputjsonfile = 'proposals.json'
|
|
||||||
|
|
||||||
# Output files:
|
repr_prop_cols = ['topic_id','proposal_id','title']
|
||||||
taggings_filename = 'ml_taggings_proposals.json'
|
tags_file_cols = ['id','name','taggings_count','kind']
|
||||||
tags_filename = 'ml_tags_proposals.json'
|
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
|
||||||
related_props_filename = 'ml_related_content_proposals.json'
|
tag_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
|
||||||
|
|
||||||
|
tags_file_cols_count = 'taggings_count'
|
||||||
|
taggings_file_cols_id = 'tag_id'
|
||||||
|
|
||||||
|
|
||||||
# In[4]:
|
# In[4]:
|
||||||
|
|
||||||
|
|
||||||
import os
|
import logging
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
logging.basicConfig(filename=os.path.join(data_path,logging_file),
|
||||||
|
filemode='w',
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
level=logging_level)
|
||||||
|
#logging.info('message')
|
||||||
|
|
||||||
# ### Read the proposals
|
|
||||||
|
|
||||||
# In[5]:
|
# In[5]:
|
||||||
|
|
||||||
|
|
||||||
# proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
import os
|
||||||
# col_id = 'id'
|
import re
|
||||||
# cols_content = ['title','description','summary']
|
import numpy as np
|
||||||
# proposals_input_df = proposals_input_df[[col_id]+cols_content]
|
import pandas as pd
|
||||||
|
from unicodedata import normalize
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
# ### Create file: Taggings. Each line is a Tag associated to a Proposal
|
|
||||||
|
|
||||||
# In[6]:
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
taggings_file_cols = ['tag_id','taggable_id','taggable_type']
|
import stanza
|
||||||
taggings_file_df = pd.DataFrame(columns=taggings_file_cols)
|
if stanza_download:
|
||||||
row = [0,1,'Proposal']
|
stanza.download(stanza_model_lang)
|
||||||
taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
|
|
||||||
taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
|
|
||||||
|
|
||||||
|
# If needed, add 'pos_batch_size': 10000 to the config options defined below.
|
||||||
|
config = {
|
||||||
|
'processors': 'tokenize,mwt,pos,lemma',
|
||||||
|
'lang': stanza_model_lang
|
||||||
|
}
|
||||||
|
#not using depparse
|
||||||
|
nlp = stanza.Pipeline(**config)
|
||||||
|
|
||||||
# ### Create file: Tags. List of Tags with the number of times they have been used
|
|
||||||
|
|
||||||
# In[7]:
|
# In[7]:
|
||||||
|
|
||||||
|
|
||||||
tags_file_cols = ['id','name','taggings_count','kind']
|
import tqdm
|
||||||
tags_file_df = pd.DataFrame(columns=tags_file_cols)
|
from tqdm.notebook import tqdm_notebook
|
||||||
row = [0,'tag',0,'']
|
tqdm_notebook.pandas()
|
||||||
tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)
|
# To use tqdm with pandas, call progress_apply instead of apply.
|
||||||
tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
|
|
||||||
|
|
||||||
|
|
||||||
# ### Create file: List of related proposals
|
|
||||||
|
|
||||||
# In[8]:
|
# In[8]:
|
||||||
|
|
||||||
|
|
||||||
numb_related_proposals = 2
|
import nltk
|
||||||
related_props_cols = ['id']+['related'+str(num) for num in range(1,numb_related_proposals+1)]
|
if nltk_download:
|
||||||
related_props_df = pd.DataFrame(columns=related_props_cols)
|
nltk.download('stopwords')
|
||||||
row = [1]+['' for num in range(1,numb_related_proposals+1)]
|
nltk.download('punkt')
|
||||||
related_props_df = related_props_df.append(dict(zip(related_props_cols,row)), ignore_index=True)
|
|
||||||
related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
from nltk.tokenize import word_tokenize, sent_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
|
import gensim
|
||||||
|
from gensim.models.phrases import Phrases, Phraser
|
||||||
|
|
||||||
|
|
||||||
|
# In[10]:
|
||||||
|
|
||||||
|
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.decomposition import NMF
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Read the proposals and join the content to use in the topic modelling
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
check_file(os.path.join(data_path,inputjsonfile))
|
||||||
|
proposals_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
|
||||||
|
proposals_input_df = proposals_input_df[[col_id]+cols_content]
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
|
||||||
|
if len(proposals_input_df) == 0:
|
||||||
|
logging.info('No Proposals found.')
|
||||||
|
with open(os.path.join(data_path,taggings_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,tags_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
with open(os.path.join(data_path,related_props_filename), 'w') as file:
|
||||||
|
file.write('[]')
|
||||||
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# In[11]:
|
||||||
|
|
||||||
|
|
||||||
|
# Normalise characters
|
||||||
|
for col in cols_content:
|
||||||
|
proposals_input_df[col] = proposals_input_df[col].apply(lambda x: normalize('NFKC',x))
|
||||||
|
|
||||||
|
proposals_input_df['joined_content'] = proposals_input_df[cols_content].agg('\n'.join, axis=1)
|
||||||
|
proposals_input_df = proposals_input_df.drop(columns=list(set(cols_content)-{col_title}))
|
||||||
|
|
||||||
|
|
||||||
|
# # Lemmatise the content

# In[12]:


proposals_input_df['joined_content_topicmodelling'] = proposals_input_df['joined_content']


# In[13]:


# Using Stanza from Stanford NLP group
def content_processing_for_topicmodelling_1(txt):

    # Delete html tags and urls
    tmp_txt = re.sub("<[^<]+?>","",txt)
    tmp_txt = re.sub(r"http[^\s]+?\s","",tmp_txt)
    tmp_txt = re.sub(r"http[^\s]+?$","",tmp_txt)
    tmp_txt = re.sub(r"www[^\s]+?\s","",tmp_txt)
    tmp_txt = re.sub(r"www[^\s]+?$","",tmp_txt)

    # Tokenise, lemmatise and select only the nouns
    new_txt_tok = []
    if len(re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ]","",tmp_txt).rstrip("\n")) != 0:
        tmp_txt_nlp = nlp(tmp_txt)

        for sent in tmp_txt_nlp.sentences:
            for token in sent.words:
                if noun_lemmatisation:
                    if token.upos == 'NOUN':
                        new_txt_tok.append(token.lemma)
                else:
                    new_txt_tok.append(token.text)

    return new_txt_tok
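# Illustrative sketch (assumption, not part of the commit): the `nlp` object called
# above is expected to be a Stanza pipeline created earlier in the script from the
# config values `stanza_model_lang` and `stanza_download`, roughly like:
#
#     import stanza
#     if stanza_download:
#         stanza.download(stanza_model_lang)
#     nlp = stanza.Pipeline(lang=stanza_model_lang, processors='tokenize,pos,lemma')
#
#     doc = nlp('Proponemos mejorar los parques del barrio.')
#     nouns = [word.lemma for sent in doc.sentences
#              for word in sent.words if word.upos == 'NOUN']
#     # nouns -> e.g. ['parque', 'barrio']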
# In[14]:


if tqdm_notebook:
    proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
        'joined_content_topicmodelling'].progress_apply(content_processing_for_topicmodelling_1)
else:
    proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
        'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_1)
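# Toy illustration (data made up): once tqdm has been registered with pandas above,
# progress_apply is a drop-in replacement for apply that also draws a progress bar.
import pandas as pd

demo_df = pd.DataFrame({'text': ['first proposal', 'second proposal', 'third proposal']})
demo_df['n_words'] = demo_df['text'].progress_apply(lambda t: len(t.split()))
# demo_df['n_words'] -> [2, 2, 2]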
# # Clean the data

# In[16]:


# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))

if stopwords_lang == 'spanish':
    for word in stop_words:
        stop_words = stop_words.union({re.sub(r"á","a",word)})
        stop_words = stop_words.union({re.sub(r"é","e",word)})
        stop_words = stop_words.union({re.sub(r"í","i",word)})
        stop_words = stop_words.union({re.sub(r"ó","o",word)})
        stop_words = stop_words.union({re.sub(r"ú","u",word)})

# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[17]:


def content_processing_for_topicmodelling_2(txt_tok):
    new_text_tok = []
    for word in txt_tok:
        new_word = word.lower()
        new_word = re.sub(r"[^a-zA-ZäÄëËïÏöÖüÜáéíóúáéíóúÁÉÍÓÚÂÊÎÔÛâêîôûàèìòùÀÈÌÒÙñÑ\s]","",new_word)
        new_word = re.sub(r"[0-9]+","",new_word)
        new_word = new_word.rstrip("\n")
        if (len(new_word) != 0) and (new_word not in all_stop_words):
            new_text_tok.append(new_word)

    return new_text_tok


# In[18]:


proposals_input_df['joined_content_topicmodelling'] = proposals_input_df[
    'joined_content_topicmodelling'].apply(content_processing_for_topicmodelling_2)
# # Detect n-grams

# In[19]:


txt_unigram = proposals_input_df['joined_content_topicmodelling'].tolist()

phrases_bigrams = Phrases(txt_unigram, min_count=n_gram_min_count)
txt_bigram = [phrases_bigrams[txt] for txt in txt_unigram]
txt_bigram_joined = [' '.join(txt) for txt in txt_bigram]

# may also contain quadrigrams when joining 2 bigrams:
# phrases_trigrams = Phrases(txt_bigram, min_count=n_gram_min_count)
# txt_trigram = [phrases_trigrams[txt] for txt in txt_bigram]
# txt_trigram_joined = [' '.join(txt) for txt in txt_trigram]

proposals_input_df['joined_content_topicmodelling'] = txt_bigram_joined
# proposals_input_df['joined_content_topicmodelling'] = txt_trigram_joined
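# Toy illustration (corpus and thresholds made up): Phrases merges word pairs that
# co-occur often enough into a single token, so frequent bigrams such as
# "participatory budgeting" survive as one vocabulary entry for the tf-idf step.
from gensim.models.phrases import Phrases

demo_sentences = [['participatory', 'budgeting', 'project'],
                  ['participatory', 'budgeting', 'in', 'the', 'district'],
                  ['participatory', 'budgeting', 'workshop']]
demo_bigram = Phrases(demo_sentences, min_count=1, threshold=1)
# demo_bigram[demo_sentences[0]] -> ['participatory_budgeting', 'project']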
# # Topic modelling (NMF)

# In[20]:


df_col_to_use = proposals_input_df['joined_content_topicmodelling']

# NUMBER OF TOPICS
n_components = numb_topics
# SELECT the TOP n_top_words WORDS for each topic
n_top_words = numb_topkeywords_pertopic

# Use tf-idf features for NMF
tfidf_vectorizer = TfidfVectorizer(max_df=max_df_val, min_df=min_df_val,
                                   max_features=n_features)

tfidf = tfidf_vectorizer.fit_transform(df_col_to_use.tolist())
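# Minimal NMF sketch on a toy corpus (illustration only, names made up): NMF factorises
# the tf-idf matrix V (proposals x terms) into non-negative W (proposals x topics) and
# H (topics x terms) with V ~ W.H, so each topic is described by its strongest terms.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

demo_docs = ['park trees playground park',
             'bike lane bike parking',
             'trees playground children park',
             'bike lane asphalt repair']
demo_vectorizer = TfidfVectorizer()
demo_V = demo_vectorizer.fit_transform(demo_docs)           # proposals x terms
demo_model = NMF(n_components=2, init='nndsvd', random_state=1)
demo_W = demo_model.fit_transform(demo_V)                   # proposals x topics
demo_H = demo_model.components_                             # topics x terms
demo_terms = demo_vectorizer.get_feature_names_out()        # get_feature_names() on older scikit-learn
for demo_idx, demo_topic in enumerate(demo_H):
    print(demo_idx, [demo_terms[i] for i in demo_topic.argsort()[:-4:-1]])   # 3 strongest terms per topic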
# In[21]:


# Includes some extra steps for Spanish
def cleaning_features(top_features):
    clean_features = top_features.copy()
    for feature in clean_features:
        if feature+'s' in clean_features: clean_features[max(
            clean_features.index(feature),clean_features.index(feature+'s'))] = ''

    if stopwords_lang == 'spanish':
        for feature in clean_features:
            if feature+'es' in clean_features: clean_features[max(
                clean_features.index(feature),clean_features.index(feature+'es'))] = ''
        for feature in clean_features:
            if feature+'r' in clean_features: clean_features[max(
                clean_features.index(feature),clean_features.index(feature+'r'))] = ''

    nosign_features = clean_features.copy()

    if stopwords_lang == 'spanish':
        for pos,fet in enumerate(nosign_features):
            nosign_features[pos]=re.sub(r"á","a",fet)
        for pos,fet in enumerate(nosign_features):
            nosign_features[pos]=re.sub(r"é","e",fet)
        for pos,fet in enumerate(nosign_features):
            nosign_features[pos]=re.sub(r"í","i",fet)
        for pos,fet in enumerate(nosign_features):
            nosign_features[pos]=re.sub(r"ó","o",fet)
        for pos,fet in enumerate(nosign_features):
            nosign_features[pos]=re.sub(r"ú","u",fet)

    for pos,fet in enumerate(nosign_features):
        if fet in nosign_features[pos+1:]:
            clean_features[max(pos_2 for pos_2,fet_2 in enumerate(nosign_features) if fet_2 == fet)] = ''

    return clean_features


# Fit the NMF model
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# nmf.components_ is the H matrix
# W = nmf.fit_transform(tfidf)

tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Size of the vocabulary and the nmf matrix
#print(len(tfidf_vectorizer.vocabulary_))
#print(len(tfidf_feature_names))
#nmf.components_.shape
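# What the slice used below for the top keywords does (made-up coefficient row):
# topic.argsort()[:-n_top_words - 1:-1] returns the indices of the n_top_words
# largest entries, largest first.
import numpy as np

demo_topic = np.array([0.1, 0.9, 0.0, 0.5, 0.3])
demo_n_top = 3
print(demo_topic.argsort()[:-demo_n_top - 1:-1])   # [1 3 4] -> positions of 0.9, 0.5, 0.3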
# ### Create file: Repr_Prop. Most representative proposal for each topic

# In[22]:


W = nmf.fit_transform(tfidf)
#print(W.shape)

repr_prop_df = pd.DataFrame(columns=repr_prop_cols)

for topic_index in range(n_components):
    top_indices = np.argsort( W[:,topic_index] )[::-1]
    top_represent_proposals = []
    for proposal_index in top_indices[0:n_top_represent_props]:
        top_represent_proposals.append(proposal_index)

    for prop_internal_index in top_represent_proposals:
        row = [topic_index,
               proposals_input_df.loc[int(prop_internal_index),'id'],
               proposals_input_df.loc[int(prop_internal_index),'title']]
        repr_prop_df = repr_prop_df.append(dict(zip(repr_prop_cols,row)), ignore_index=True)


# In[23]:


repr_prop_df.to_json(os.path.join(data_path,repr_prop_filename),orient="records", force_ascii=False)
repr_prop_df.to_csv(os.path.join(data_path,repr_prop_filename_csv), index=False)
# ### Create file: Topics_Tags. List of Topics with their top Tags

# In[24]:


topics_tags_df = pd.DataFrame(columns=['id']+tag_cols)

# nmf.components_ is the H matrix

for topic_idx, topic in enumerate(nmf.components_):
    obj_temp = [tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    clean_obj_temp = cleaning_features(obj_temp)
    clean_obj_temp.insert(0, str(topic_idx))
    #print(clean_obj_temp)
    topics_tags_df = topics_tags_df.append(dict(zip(['id']+tag_cols,clean_obj_temp)), ignore_index=True)


# In[25]:


topics_tags_df.to_json(os.path.join(data_path,topics_tags_filename),orient="records", force_ascii=False)
topics_tags_df.to_csv(os.path.join(data_path,topics_tags_filename_csv), index=False)
# ### Create file: Taggings. Each line is a Tag associated to a Proposal

# In[26]:


# Coefficients for following calculation
tag_coefs_cols = ['tag'+str(num) for num in range(1,numb_topkeywords_pertopic+1)]
topics_tags_coefs_df = pd.DataFrame(columns=['id']+tag_coefs_cols)

# nmf.components_ is the H matrix

for topic_idx, topic in enumerate(nmf.components_):
    topics_tags_coefs_temp = []
    topics_tags_coefs_temp.append(int(topic_idx))
    for i in topic.argsort()[:-n_top_words - 1:-1]:
        topics_tags_coefs_temp.append(topic[i])
    topics_tags_coefs_df = topics_tags_coefs_df.append(dict(zip(['id']+tag_coefs_cols,
                                                                topics_tags_coefs_temp)), ignore_index=True)

for col in tag_cols:
    for topic_idx,topic in enumerate(topics_tags_df[col].tolist()):
        if topic == '':
            topics_tags_coefs_df.loc[int(topic_idx),col] = 0.0


# In[27]:


topics_tags_flat = []
for idx,topic in topics_tags_df.iterrows():
    topics_tags_flat = topics_tags_flat + topics_tags_df.loc[idx,tag_cols].tolist()

topics_tags_coefs_flat = []
for idx,topic in topics_tags_coefs_df.iterrows():
    topics_tags_coefs_flat = topics_tags_coefs_flat + topics_tags_coefs_df.loc[idx,tag_coefs_cols].tolist()


# In[28]:


taggings_file_df = pd.DataFrame(columns=taggings_file_cols)

for prop_idx,prop in tqdm.tqdm(enumerate(W),total=len(W)):
    proposal_topics_temp = np.zeros((len(topics_tags_flat)))
    cont = 0
    for weight in prop:
        for n in range(n_top_words):
            proposal_topics_temp[cont] = weight
            cont += 1

    proposal_tags_temp = proposal_topics_temp*topics_tags_coefs_flat

    # Adding the coefficients of same tags:
    for numterm_a,term_a in enumerate(topics_tags_flat):
        for numterm_b in reversed(range(numterm_a+1,len(topics_tags_flat))):
            term_b = topics_tags_flat[numterm_b]
            if (term_a == term_b):
                proposal_tags_temp[numterm_a] = proposal_tags_temp[numterm_a] + proposal_tags_temp[numterm_b]
                proposal_tags_temp[numterm_b] = 0

    for i in proposal_tags_temp.argsort()[:-n_top_words - 1:-1]:
        row = [i,proposals_input_df.loc[prop_idx,'id'],'Proposal']
        taggings_file_df = taggings_file_df.append(dict(zip(taggings_file_cols,row)), ignore_index=True)
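# Compact numpy sketch of the proposal-tag scoring above (toy numbers, names made up):
# each proposal's topic weights (a row of W) are repeated once per top keyword of the
# topic and multiplied by the keyword coefficients taken from H, so a tag scores high
# when the proposal loads on a topic and the keyword is strong inside that topic.
import numpy as np

demo_n_top = 2
demo_prop_topic_weights = np.array([0.8, 0.1])              # one row of W: 2 topics
demo_topic_keyword_coefs = np.array([0.9, 0.4, 0.7, 0.2])   # flattened top coefficients: 2 per topic
demo_expanded = np.repeat(demo_prop_topic_weights, demo_n_top)   # [0.8, 0.8, 0.1, 0.1]
demo_tag_scores = demo_expanded * demo_topic_keyword_coefs       # [0.72, 0.32, 0.07, 0.02]
print(demo_tag_scores.argsort()[:-demo_n_top - 1:-1])            # [0 1] -> the two best tags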
# ### Create file: Tags. List of Tags with the number of times they have been used

# In[29]:


tags_file_df = pd.DataFrame(columns=tags_file_cols)

for tag_id,tag in enumerate(topics_tags_flat):
    row = [tag_id,tag,0,'']
    tags_file_df = tags_file_df.append(dict(zip(tags_file_cols,row)), ignore_index=True)

for tag_id in taggings_file_df[taggings_file_cols_id].tolist():
    tags_file_df.loc[tag_id,tags_file_cols_count] = tags_file_df.loc[tag_id,tags_file_cols_count]+1
# ### Deleting duplicate tags from files Tag and Taggings before saving them

# In[30]:


change_rows = []
repeated_ids = []
for idx1,row1 in tags_file_df.iterrows():
    for idx2,row2 in tags_file_df.iterrows():
        if (idx2 > idx1) and (idx2 not in repeated_ids) and (row1['name'] == row2['name']):
            change_rows.append((idx1,idx2))
            repeated_ids.append(idx2)

tags_file_df = tags_file_df.drop(repeated_ids)


# In[31]:


for c_row in change_rows:
    taggings_file_df['tag_id'] = taggings_file_df['tag_id'].apply(lambda x: c_row[0] if x == c_row[1] else x)


# In[32]:


tags_file_df.to_json(os.path.join(data_path,tags_filename),orient="records", force_ascii=False)
tags_file_df.to_csv(os.path.join(data_path,tags_filename_csv), index=False)


# In[33]:


taggings_file_df.to_json(os.path.join(data_path,taggings_filename),orient="records", force_ascii=False)
taggings_file_df.to_csv(os.path.join(data_path,taggings_filename_csv), index=False)
# In[47]:


# proposals_input_df


# In[48]:


# repr_prop_df


# In[49]:


# topics_tags_df


# In[50]:


# taggings_file_df


# In[51]:


# tags_file_df
# # LIST OF RELATED PROPOSALS

# In[34]:


proposal_topics_coefs_cols = ['id','topic_coefs']
proposal_topics_coefs_df = pd.DataFrame(columns=proposal_topics_coefs_cols)

for prop_idx,prop in enumerate(W):
    row = [proposals_input_df.loc[prop_idx,'id'],prop.copy()]
    proposal_topics_coefs_df = proposal_topics_coefs_df.append(dict(zip(proposal_topics_coefs_cols,row)),
                                                               ignore_index=True)


# In[35]:


related_props_df = pd.DataFrame(columns=related_props_cols)

for idx,row in tqdm.tqdm(proposal_topics_coefs_df.iterrows(),total=len(proposal_topics_coefs_df)):
    prop_related_temp = []
    prop_related_temp.append(int(row['id']))
    vectora = row['topic_coefs']
    distances = [np.linalg.norm(vectora-vectorb) for vectorb in proposal_topics_coefs_df['topic_coefs'].tolist()]

    # the vector contains also the id of the initial proposal, thus numb_related_proposals+1
    for i in np.array(distances).argsort()[0:numb_related_proposals+1]:
        if distances[i] != 0.0:
            prop_related_temp.append(int(proposals_input_df.loc[i,'id']))

    # in case there are less related proposals than the max number
    while len(prop_related_temp) < numb_related_proposals+1:
        prop_related_temp.append('')

    related_props_df = related_props_df.append(dict(zip(related_props_cols,prop_related_temp)), ignore_index=True)


# In[36]:


related_props_df.to_json(os.path.join(data_path,related_props_filename),orient="records", force_ascii=False)
related_props_df.to_csv(os.path.join(data_path,related_props_filename_csv), index=False)
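# Vectorised sketch of the nearest-neighbour lookup above (toy topic vectors, names
# made up): each proposal's related proposals are simply its closest rows of W by
# Euclidean distance, skipping the zero distance to itself.
import numpy as np

demo_W = np.array([[0.9, 0.1],
                   [0.8, 0.2],
                   [0.1, 0.9],
                   [0.2, 0.8]])
demo_numb_related = 2
demo_dists = np.linalg.norm(demo_W[:, None, :] - demo_W[None, :, :], axis=-1)   # pairwise distances
np.fill_diagonal(demo_dists, np.inf)                                             # ignore self-matches
print(demo_dists.argsort(axis=1)[:, :demo_numb_related])   # indices of the 2 closest proposals per row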
# In[45]:


#proposal_topics_coefs_df


# In[46]:


#related_props_df


# In[44]:


logging.info('Script executed correctly.')
@@ -5,55 +5,545 @@


"""
Proposals comments summaries

This script generates for each proposal a summary of all its comments.
Running time: Max 1 hour for 10.000 proposals.
Technique used: GloVe embeddings and TextRank.
More info in: https://github.com/consul-ml/consul-ml
"""
# In[2]:


# DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER:

# ENGLISH:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!gunzip glove.6B.zip

# SPANISH:
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
#!gunzip glove-sbwc*.gz


# In[ ]:


def check_file(file_name):
    if os.path.isfile(file_name):
        return
    else:
        try:
            logging.info('Missing file in Proposals comments summaries: ' + str(file_name))
        except NameError:
            print('No logging')
        with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
            file.write('[]')
        os._exit(0)


# In[ ]:


# Input file:
inputjsonfile = 'comments.json'
col_id = 'commentable_id'
col_content = 'body'

# Output files:
comments_summaries_filename = 'ml_comments_summaries_proposals.json'
comments_summaries_filename_csv = 'ml_comments_summaries_proposals.csv'

tqdm_notebook = True
# In[3]:


data_path = '../data'
config_file = 'proposals_summary_comments_textrank.ini'
logging_file ='proposals_summary_comments_textrank.log'

# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))

sent_token_lang = config['PREPROCESSING']['sent_token_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')

if stopwords_lang == 'spanish':
    glove_file = config['SUMMARISATION']['glove_file_es']
if stopwords_lang == 'english':
    glove_file = config['SUMMARISATION']['glove_file_en']
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')

logging_level = config['LOGGING']['logging_level']
# In[5]:


import logging

logging.basicConfig(filename=os.path.join(data_path,logging_file),
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging_level)
#logging.info('message')


# In[6]:


import os
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import sys
# In[7]:


import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')


# In[8]:


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


# In[9]:


from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


# In[10]:


from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections


# In[11]:


import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[12]:


# Different code for Spanish and English vectors
# Extract word vectors

check_file(os.path.join(data_path,glove_file))

if stopwords_lang == 'english':
    non_keyed_embs = os.path.join(data_path,glove_file)
    keyed_embs = os.path.join(data_path,glove_file+'.vec')
    if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
        glove2word2vec(non_keyed_embs, keyed_embs)
    glove_file = glove_file+'.vec'

word_embeddings = {}
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
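# Quick sanity check of the loaded vectors (illustrative words; assumes the standard
# 300-dimensional GloVe / SBWC files): related words should have a cosine similarity
# closer to 1, unrelated or out-of-vocabulary words closer to 0.
demo_v1 = word_embeddings.get('parque', np.zeros(300))
demo_v2 = word_embeddings.get('jardín', np.zeros(300))
print(demo_v1.shape)   # (300,) when the word is in the vocabulary
print(float(np.dot(demo_v1, demo_v2) /
            (np.linalg.norm(demo_v1) * np.linalg.norm(demo_v2) + 1e-9)))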
# # Read the comments and join the comments belonging to the same proposal

# In[ ]:


check_file(os.path.join(data_path,inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Proposal']
# In[ ]:


# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(comments_input_df) == 0:
    logging.info('No Proposals comments found to summarise.')
    with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
        file.write('[]')
    os._exit(0)


# In[13]:


comments_input_df = comments_input_df[[col_id]+[col_content]]

# Normalise characters
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))

comments_input_df = comments_input_df.sort_values(by=col_id)
comments_input_df.reset_index(drop=True,inplace=True)


# In[14]:


# Drop empty texts

empty_txt_ids = []
for idx,row in comments_input_df.iterrows():
    if row['body'].strip() == '':
        empty_txt_ids.append(idx)

comments_input_df = comments_input_df.drop(empty_txt_ids)
comments_input_df.reset_index(drop=True,inplace=True)
# In[15]:


comments_df = pd.DataFrame()

temp_comments_joined = []
temp_comments_number = []
temp_proposal_id = []
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
    temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
    temp_comments_joined.append('\n'.join(temp_list))
    temp_comments_number.append(len(temp_list))
    temp_proposal_id.append(prop_id)

comments_df['prop_id'] = temp_proposal_id
comments_df['comments_joined'] = temp_comments_joined
comments_df['comments_number'] = temp_comments_number


# In[16]:


# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))
# # Make comments lowercase

# In[17]:


comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())


# # Split sentences

# In[18]:


def split_sentences(txt):
    new_text_1 = sent_tokenize(txt,sent_token_lang)
    #outputs [] if txt is ''; or made of ' ' or '\n'

    new_text_2 = []
    if new_text_1 != []:
        for tok1 in new_text_1:
            new_text_2 += tok1.split('\n')
            #outputs [''] if txt is ''
    new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']

    if new_text_2 == []:
        new_text_2 = ['']

    return new_text_2


# In[19]:


comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)
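# Illustration (made-up comment text): sent_tokenize with the configured language is
# what split_sentences above relies on; roughly:
from nltk.tokenize import sent_tokenize

demo_text = 'Me parece una gran idea. Faltan papeleras en el parque. ¿Cuánto costaría?'
print(sent_tokenize(demo_text, 'spanish'))
# roughly: ['Me parece una gran idea.', 'Faltan papeleras en el parque.', '¿Cuánto costaría?']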
# # Calculate sentence embeddings

# In[20]:


# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))

if stopwords_lang == 'spanish':
    for word in stop_words:
        stop_words = stop_words.union({re.sub(r"á","a",word)})
        stop_words = stop_words.union({re.sub(r"é","e",word)})
        stop_words = stop_words.union({re.sub(r"í","i",word)})
        stop_words = stop_words.union({re.sub(r"ó","o",word)})
        stop_words = stop_words.union({re.sub(r"ú","u",word)})

# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[21]:


def sentences_embeddings(sents):
    sent_embs = []

    for sent in sents:
        words = set(word_tokenize(sent))
        words = words-all_stop_words
        if len(words) != 0:
            emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
                len(words)+0.001)
        else:
            emb = np.zeros(300)
        sent_embs.append(emb)

    return sent_embs


# In[22]:


if tqdm_notebook:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].progress_apply(sentences_embeddings)
else:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].apply(sentences_embeddings)
# # Calculate sentence scores

# In[23]:


def sentences_scores(sents, sent_embs):

    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        return collections.OrderedDict([('',1.0)])

    nx_graph = nx.from_numpy_array(sim_mat)

    try:
        sentence_weight_temp = nx.pagerank(nx_graph)
    except:
        sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)

    sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}

    sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
    sentence_scores = collections.OrderedDict(sorted_sentence_weights)

    return sentence_scores
# In[24]:


def plot_sentences_network(sents, sent_embs):
    import matplotlib.pyplot as plt

    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        print('Nothing to plot')
        return

    nx_graph = nx.from_numpy_array(sim_mat)

    plt.plot()
    nx.draw(nx_graph, with_labels=True)


# In[25]:


comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
    lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1)
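# Toy TextRank run (sentences and embeddings made up): sentence vectors -> cosine
# similarity matrix -> graph -> PageRank, so sentences similar to many others rank highest.
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

demo_sents = ['more bins in the park', 'the park needs bins', 'repaint the school']
demo_embs = np.array([[1.0, 0.1, 0.0],
                      [0.9, 0.2, 0.0],
                      [0.0, 0.1, 1.0]])
demo_sim = cosine_similarity(demo_embs, demo_embs)
np.fill_diagonal(demo_sim, 0)
demo_scores = nx.pagerank(nx.from_numpy_array(demo_sim))
print(sorted(demo_scores.items(), key=lambda kv: kv[1], reverse=True))
# the two park/bins sentences (nodes 0 and 1) outrank the unrelated one (node 2)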
# # Generate the summaries

# In[26]:


def comments_summary(sentence_weight, threshold_factor, *totalwords):

    threshold = threshold_factor * np.mean(list(sentence_weight.values()))

    sentence_counter = 0
    comments_summary = ''

    summary_num_words = 0

    for sentence in sentence_weight:
        if sentence_weight[sentence] >= (threshold):
            if len(totalwords) == 0:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
            elif summary_num_words < totalwords[0]:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
                summary_num_words += len(sentence.split())

    comments_summary = comments_summary.lstrip()
    return comments_summary
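# Compact restatement of the selection rule above (made-up scores, demo names): keep,
# in rank order, the sentences whose TextRank weight is at least threshold_factor times
# the mean weight, until roughly max_size_of_summaries words have been used.
import numpy as np

demo_scores = {'sentence a': 0.40, 'sentence b': 0.35, 'sentence c': 0.05}
demo_threshold = 1.0 * np.mean(list(demo_scores.values()))   # threshold_factor assumed 1.0
demo_summary, demo_used = [], 0
for demo_sent, demo_w in sorted(demo_scores.items(), key=lambda kv: kv[1], reverse=True):
    if demo_w >= demo_threshold and demo_used < 6:           # word budget assumed 6
        demo_summary.append('- ' + demo_sent)
        demo_used += len(demo_sent.split())
print('\n'.join(demo_summary))   # keeps 'sentence a' and 'sentence b'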
# In[27]:


comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
    lambda x: comments_summary(x,threshold_factor,max_size_of_summaries))


# In[28]:


# comments_df


# In[29]:


# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
#     print(row['body'])
#     print('-------')


# In[30]:


#print(comments_df.loc[8,'comments_summary'])


# In[31]:


comments_df['commentable_type'] = ['Proposal']*len(comments_df)
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
comments_summaries_df.reset_index(level=0, inplace=True)
# In[32]:


comments_summaries_df = comments_summaries_df.rename(
    columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})


# In[33]:


#comments_summaries_df


# In[34]:


comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)


# In[35]:


logging.info('Script executed correctly.')