Update machine learning scripts with NNMF and TextRank-GloVe techniques
@@ -1,59 +1,549 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:
# In[ ]:


"""
Participatory Budgeting comments summaries - Dummy script
Participatory Budgeting comments summaries

This script generates, for each budget project, a summary of all its comments.

Running time: at most 1 hour for 10,000 proposals.

Technique used: GloVe embeddings and TextRank.

More info at: https://github.com/consul-ml/consul-ml
"""


# In[2]:
# In[ ]:


# DOWNLOAD THE GLOVE EMBEDDINGS INTO THE DATA FOLDER:

# ENGLISH:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

# SPANISH:
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
#!gunzip glove-sbwc*.gz


# In[ ]:


def check_file(file_name):
    # If a required input file is missing, log it (when logging is already configured),
    # write an empty JSON array as the output and stop the script.
    if os.path.isfile(file_name):
        return
    else:
        try:
            logging.info('Missing file in Participatory Budgeting comments summaries: ' + str(file_name))
        except NameError:
            print('No logging')
        with open(os.path.join(data_path, comments_summaries_filename), 'w') as file:
            file.write('[]')
        os._exit(0)


# In[ ]:


# Input file:
inputjsonfile = 'comments.json'
col_id = 'commentable_id'
col_content = 'body'

# Output files:
comments_summaries_filename = 'ml_comments_summaries_budgets.json'
comments_summaries_filename_csv = 'ml_comments_summaries_budgets.csv'

# Show tqdm progress bars in the pandas apply calls below
tqdm_notebook = True


# In[ ]:


data_path = '../data'
config_file = 'budgets_summary_comments_textrank.ini'
logging_file = 'budgets_summary_comments_textrank.log'

# Read the configuration file
import os
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path, config_file))
config.read(os.path.join(data_path, config_file))

# In[3]:
sent_token_lang = config['PREPROCESSING']['sent_token_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')

if stopwords_lang == 'spanish':
    glove_file = config['SUMMARISATION']['glove_file_es']
elif stopwords_lang == 'english':
    glove_file = config['SUMMARISATION']['glove_file_en']
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')

logging_level = config['LOGGING']['logging_level']
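
# Illustrative sketch (not part of the original commit): the .ini file read above is expected
# to provide the sections and keys used in this script. The concrete values shown here are
# hypothetical examples, not values taken from the repository.
#
# [PREPROCESSING]
# sent_token_lang = spanish
# stopwords_lang = spanish
# nltk_download = yes
#
# [SUMMARISATION]
# glove_file_es = glove-sbwc.i25.vec
# glove_file_en = glove.6B.300d.txt
# threshold_factor = 1.0
# max_size_of_summaries = 100
#
# [LOGGING]
# logging_level = INFO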


# Input file:
inputjsonfile = 'comments.json'

# Output files:
comments_summaries_filename = 'ml_comments_summaries_budgets.json'
# In[ ]:


# In[4]:
import logging

logging.basicConfig(filename=os.path.join(data_path, logging_file),
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging_level)
#logging.info('message')


# In[ ]:


import os
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import sys


# ### Read the comments

# In[5]:
# In[ ]:


# comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
# col_id = 'commentable_id'
# col_content = 'body'
# comments_input_df = comments_input_df[[col_id]+[col_content]]
import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')
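    # Note (assumption, not from the original commit): newer NLTK releases also need the
    # 'punkt_tab' resource for sent_tokenize; uncomment the next line if that applies.
    # nltk.download('punkt_tab')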


# ### Create file. Comments summaries

# In[6]:
# In[ ]:


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


# In[ ]:


from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors


# In[ ]:


from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections


# In[ ]:


import tqdm
# Note: this import shadows the tqdm_notebook boolean defined above, so the
# progress-bar branch of the apply calls below is always taken.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
# To use tqdm with pandas, call progress_apply instead of apply


# In[ ]:


# Different code for Spanish and English vectors
# Extract word vectors

check_file(os.path.join(data_path, glove_file))

if stopwords_lang == 'english':
    # The raw English GloVe file has no header, so convert it once to word2vec (.vec) format
    non_keyed_embs = os.path.join(data_path, glove_file)
    keyed_embs = os.path.join(data_path, glove_file + '.vec')
    if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
        glove2word2vec(non_keyed_embs, keyed_embs)
    glove_file = glove_file + '.vec'

word_embeddings = {}
f = open(os.path.join(data_path, glove_file), encoding='utf-8')
for line in f:
    values = line.split()
    if len(values) == 2:
        # Skip the word2vec header line ("<vocabulary size> <dimensions>")
        continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
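
# Optional alternative (illustrative sketch, not part of the original commit): since the
# embeddings file ends up in word2vec format, it could also be loaded with the KeyedVectors
# class imported above instead of parsing the file by hand. The variable name glove_vectors
# is hypothetical, and key_to_index assumes gensim 4.x (older versions expose .vocab instead).
#
# glove_vectors = KeyedVectors.load_word2vec_format(os.path.join(data_path, glove_file))
# word_embeddings = {word: glove_vectors[word] for word in glove_vectors.key_to_index}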


# In[ ]:


# # Read the comments and join the comments belonging to the same proposal

# In[ ]:


check_file(os.path.join(data_path, inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path, inputjsonfile), orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Budget::Investment']


# In[ ]:


# TERMINATE THE SCRIPT IF THERE ARE NO COMMENTS TO SUMMARISE
if len(comments_input_df) == 0:
    logging.info('No Participatory Budgeting comments found to summarise.')
    with open(os.path.join(data_path, comments_summaries_filename), 'w') as file:
        file.write('[]')
    os._exit(0)
# In[ ]:
|
||||
|
||||
|
||||
comments_input_df = comments_input_df[[col_id]+[col_content]]
|
||||
|
||||
# Normalise characters
|
||||
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))
|
||||
|
||||
comments_input_df = comments_input_df.sort_values(by=col_id)
|
||||
comments_input_df.reset_index(drop=True,inplace=True)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Drop empty texts
|
||||
|
||||
empty_txt_ids = []
|
||||
for idx,row in comments_input_df.iterrows():
|
||||
if row['body'].strip() == '':
|
||||
empty_txt_ids.append(idx)
|
||||
|
||||
comments_input_df = comments_input_df.drop(empty_txt_ids)
|
||||
comments_input_df.reset_index(drop=True,inplace=True)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
comments_df = pd.DataFrame()
|
||||
|
||||
temp_comments_joined = []
|
||||
temp_comments_number = []
|
||||
temp_proposal_id = []
|
||||
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
|
||||
temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
|
||||
temp_comments_joined.append('\n'.join(temp_list))
|
||||
temp_comments_number.append(len(temp_list))
|
||||
temp_proposal_id.append(prop_id)
|
||||
|
||||
comments_df['prop_id'] = temp_proposal_id
|
||||
comments_df['comments_joined'] = temp_comments_joined
|
||||
comments_df['comments_number'] = temp_comments_number
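
# Equivalent pandas groupby (illustrative sketch, not part of the original commit); the
# variable name grouped is hypothetical:
#
# grouped = comments_input_df.groupby(col_id)[col_content]
# comments_df = pd.DataFrame({
#     'prop_id': list(grouped.groups.keys()),
#     'comments_joined': grouped.apply('\n'.join).tolist(),
#     'comments_number': grouped.size().tolist(),
# })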


# In[ ]:


# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))


# In[ ]:


# # Make comments lowercase

# In[ ]:


comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())


# In[ ]:


# # Split sentences

# In[ ]:


def split_sentences(txt):
    new_text_1 = sent_tokenize(txt, sent_token_lang)
    # outputs [] if txt is ''; or made of ' ' or '\n'

    new_text_2 = []
    if new_text_1 != []:
        for tok1 in new_text_1:
            new_text_2 += tok1.split('\n')
            # outputs [''] if txt is ''
    new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']

    if new_text_2 == []:
        new_text_2 = ['']

    return new_text_2
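
# Example (illustrative only, not part of the original commit): newline-separated comments are
# split into individual sentences, and an empty input falls back to a single empty string,
# roughly like this:
#
# split_sentences('me gusta la idea\nmuy necesaria\n')  -> ['me gusta la idea', 'muy necesaria']
# split_sentences('')                                    -> ['']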


# In[ ]:


comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)


# In[ ]:


# # Calculate sentence embeddings

# In[ ]:


# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))

if stopwords_lang == 'spanish':
    # Also add unaccented variants (e.g. 'también' -> 'tambien'), since many comments omit accents
    for word in stop_words:
        stop_words = stop_words.union({re.sub(r"á", "a", word)})
        stop_words = stop_words.union({re.sub(r"é", "e", word)})
        stop_words = stop_words.union({re.sub(r"í", "i", word)})
        stop_words = stop_words.union({re.sub(r"ó", "o", word)})
        stop_words = stop_words.union({re.sub(r"ú", "u", word)})

# additional terms removed when found as an independent token
if stopwords_lang == 'spanish':
    additional_stop_words = {'(', ')', ',', '.', '...', '?', '¿', '!', '¡', ':', ';', 'd', 'q', 'u'}
else:
    additional_stop_words = {'(', ')', ',', '.', '...', '?', '¿', '!', '¡', ':', ';'}
all_stop_words = stop_words.union(additional_stop_words)


# In[ ]:


def sentences_embeddings(sents):
    # Average the GloVe vectors of the non-stop-word tokens of each sentence
    # (300-dimensional vectors are assumed; unknown words count as zero vectors)
    sent_embs = []

    for sent in sents:
        words = set(word_tokenize(sent))
        words = words - all_stop_words
        if len(words) != 0:
            emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
                len(words)+0.001)
        else:
            emb = np.zeros(300)
        sent_embs.append(emb)

    return sent_embs
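
# Example (illustrative only, not part of the original commit): each sentence maps to one
# 300-dimensional vector, so for a single-sentence input something like the following holds:
#
# embs = sentences_embeddings(['me gusta la idea'])
# len(embs)      -> 1
# embs[0].shape  -> (300,)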


# In[ ]:


if tqdm_notebook:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].progress_apply(sentences_embeddings)
else:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].apply(sentences_embeddings)


# In[ ]:


# # Calculate sentence scores

# In[ ]:


def sentences_scores(sents, sent_embs):
    # TextRank: rank the sentences with PageRank on a graph whose edge weights are
    # the cosine similarities between sentence embeddings

    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs, stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        return collections.OrderedDict([('', 1.0)])

    nx_graph = nx.from_numpy_array(sim_mat)

    try:
        sentence_weight_temp = nx.pagerank(nx_graph)
    except Exception:
        # e.g. PageRank did not converge: fall back to equal (zero) weights
        sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)

    sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}

    sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
    sentence_scores = collections.OrderedDict(sorted_sentence_weights)

    return sentence_scores
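
# Example (illustrative only, not part of the original commit): the result is an ordered dict
# from sentence to TextRank score, highest first; the scores returned by PageRank sum to
# (approximately) 1, e.g.:
#
# sents = ['primera frase de prueba', 'otra frase distinta']
# scores = sentences_scores(sents, sentences_embeddings(sents))
# list(scores.keys())    -> the two sentences, best-ranked first
# sum(scores.values())   -> roughly 1.0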


# In[ ]:


def plot_sentences_network(sents, sent_embs):
    import matplotlib.pyplot as plt

    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs, stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        print('Nothing to plot')
        return

    nx_graph = nx.from_numpy_array(sim_mat)

    plt.plot()
    nx.draw(nx_graph, with_labels=True)


# In[ ]:


comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
    lambda row: sentences_scores(row['comments_sentences'], row['comments_sentences_embeddings']), axis=1)


# In[ ]:


# # Generate the summaries

# In[ ]:


def comments_summary(sentence_weight, threshold_factor, *totalwords):
    # Keep the sentences whose TextRank score is at least threshold_factor times the mean
    # score; an optional word limit (totalwords) caps the length of the summary.

    threshold = threshold_factor * np.mean(list(sentence_weight.values()))

    sentence_counter = 0
    comments_summary = ''

    summary_num_words = 0

    for sentence in sentence_weight:
        if sentence_weight[sentence] >= (threshold):
            if len(totalwords) == 0:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
            elif summary_num_words < totalwords[0]:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
                summary_num_words += len(sentence.split())

    comments_summary = comments_summary.lstrip()
    return comments_summary
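
# Example (illustrative only, not part of the original commit): with a threshold factor of 1.0
# only sentences scoring at least the mean are kept, and the word limit stops further additions:
#
# comments_summary({'frase importante': 0.6, 'frase secundaria': 0.4}, 1.0, 50)
# -> '- frase importante'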


# In[ ]:


comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
    lambda x: comments_summary(x, threshold_factor, max_size_of_summaries))


# In[ ]:


# comments_df


# In[ ]:


# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
#     print(row['body'])
#     print('-------')


# In[ ]:


#print(comments_df.loc[8,'comments_summary'])


# In[ ]:


# In[ ]:


comments_df['commentable_type'] = ['Budget::Investment']*len(comments_df)
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
comments_summaries_df.reset_index(level=0, inplace=True)


# In[ ]:


comments_summaries_df = comments_summaries_df.rename(
    columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})


# In[ ]:


#comments_summaries_df


# In[ ]:


comments_summaries_cols = ['id','commentable_id','commentable_type','body']
comments_summaries_df = pd.DataFrame(columns=comments_summaries_cols)
row = [0,0,'Budget::Investment','Summary']
comments_summaries_df = comments_summaries_df.append(dict(zip(comments_summaries_cols,row)), ignore_index=True)
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)
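
# Note (not part of the original commit): DataFrame.append was removed in pandas 2.0; on newer
# pandas versions the row added above would need pd.concat instead, along these lines:
#
# comments_summaries_df = pd.concat(
#     [comments_summaries_df, pd.DataFrame([dict(zip(comments_summaries_cols, row))])],
#     ignore_index=True)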


# In[ ]:


# In[ ]:


# In[ ]:


logging.info('Script executed correctly.')