nairobi/public/machine_learning/scripts/proposals_summary_comments_textrank.py
Commit 2f312bf474 by Javi Martín: Use a different machine learning folder per tenant
We're using the "tenants" subfolder for consistency with the folder
structure we use in ActiveStorage, and because some CONSUL installations
might have folders inside the `data` folder that would conflict with the
folders created by tenants. The resulting path selection is sketched
right after this note.

Note that the Python scripts have a lot of duplication, meaning we need
to change all of them. I'm not refactoring them because I'm not familiar
enough with these scripts (or with Python, for that matter).

Also note that the scripts folder is still shared by all tenants,
meaning it isn't possible to have different scripts for different
tenants. I'm not sure how this situation should be handled; again, I'm
not familiar enough with this feature.
2022-11-11 01:40:04 +01:00
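
A minimal sketch of the per-tenant path selection these scripts now use (the tenant name "madrid" is only an illustrative example, not something shipped with CONSUL):

    import os

    # With CONSUL_TENANT=madrid the scripts read and write under
    # ../../tenants/madrid/machine_learning/data; without a tenant they
    # fall back to the shared ../data folder.
    if os.environ.get("CONSUL_TENANT"):
        data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
    else:
        data_path = '../data'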


#!/usr/bin/env python
# coding: utf-8
# In[1]:
"""
Proposals comments summaries
This script generates for each proposal a summary of all its comments.
Running time: Max 1 hour for 10.000 proposals.
Technique used: GloVe embeddings and TextRank.
More info in: https://github.com/consul-ml/consul-ml
"""
# In[2]:
# DOWNLOAD THE GLOVE EMBEDDINGS, IN THE DATA FOLDER:
# ENGLISH:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip
# SPANISH:
#!wget http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz
#!gunzip glove-sbwc*.gz
# In[ ]:
# If a required input file is missing, log it (when logging is already configured),
# write an empty JSON result so CONSUL still gets a valid file, and stop the script.
def check_file(file_name):
    if os.path.isfile(file_name):
        return
    else:
        try:
            logging.info('Missing file in Proposals comments summaries: ' + str(file_name))
        except NameError:
            print('No logging')
        with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
            file.write('[]')
        os._exit(0)
# In[ ]:
# Input file:
inputjsonfile = 'comments.json'
col_id = 'commentable_id'
col_content = 'body'
# Output files:
comments_summaries_filename = 'ml_comments_summaries_proposals.json'
comments_summaries_filename_csv = 'ml_comments_summaries_proposals.csv'
tqdm_notebook = True
# In[3]:
import os
if os.environ.get("CONSUL_TENANT"):
    data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
else:
    data_path = '../data'
config_file = 'proposals_summary_comments_textrank.ini'
logging_file = 'proposals_summary_comments_textrank.log'
# Read the configuration file
import configparser
config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file))
config.read(os.path.join(data_path,config_file))
sent_token_lang = config['PREPROCESSING']['sent_token_lang']
stopwords_lang = config['PREPROCESSING']['stopwords_lang']
nltk_download = config['PREPROCESSING'].getboolean('nltk_download')
if stopwords_lang == 'spanish':
    glove_file = config['SUMMARISATION']['glove_file_es']
if stopwords_lang == 'english':
    glove_file = config['SUMMARISATION']['glove_file_en']
threshold_factor = config['SUMMARISATION'].getfloat('threshold_factor')
max_size_of_summaries = config['SUMMARISATION'].getint('max_size_of_summaries')
logging_level = config['LOGGING']['logging_level']
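# An illustrative proposals_summary_comments_textrank.ini covering the keys read above
# (the values are examples, not necessarily the shipped defaults):
#   [PREPROCESSING]
#   sent_token_lang = english
#   stopwords_lang = english
#   nltk_download = True
#
#   [SUMMARISATION]
#   glove_file_en = glove.6B.300d.txt
#   glove_file_es = glove-sbwc.i25.vec
#   threshold_factor = 1.0
#   max_size_of_summaries = 150
#
#   [LOGGING]
#   logging_level = INFO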
# In[5]:
import logging
logging.basicConfig(filename=os.path.join(data_path,logging_file),
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging_level)
#logging.info('message')
# In[6]:
import os
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import sys
# In[7]:
import nltk
if nltk_download:
    nltk.download('stopwords')
    nltk.download('punkt')
# In[8]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# In[9]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
# In[10]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import collections
# In[11]:
import tqdm
# Import under an alias so it doesn't shadow the tqdm_notebook flag set above
from tqdm.notebook import tqdm_notebook as tqdm_nb
tqdm_nb.pandas()
# to use tqdm in pandas use progress_apply instead of apply
# In[12]:
# Different code for Spanish and English vectors
# Extract word vectors
check_file(os.path.join(data_path,glove_file))
if stopwords_lang == 'english':
    non_keyed_embs = os.path.join(data_path,glove_file)
    keyed_embs = os.path.join(data_path,glove_file+'.vec')
    if (not os.path.isfile(keyed_embs)) and (os.path.isfile(non_keyed_embs)):
        glove2word2vec(non_keyed_embs, keyed_embs)
    glove_file = glove_file+'.vec'
word_embeddings = {}
f = open(os.path.join(data_path,glove_file), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()
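# Each line of the GloVe text file is a token followed by its vector components,
# e.g. (made-up numbers; the vectors used here are 300-dimensional):
#   the 0.0461 0.2134 -0.0073 ... 0.1159
# so values[0] is the word and values[1:] are the coefficients parsed above.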
# In[ ]:
# # Read the comments and join the comments belonging to the same proposal
# In[ ]:
check_file(os.path.join(data_path,inputjsonfile))
comments_input_df = pd.read_json(os.path.join(data_path,inputjsonfile),orient="records")
comments_input_df = comments_input_df[comments_input_df['commentable_type'] == 'Proposal']
# In[ ]:
# TERMINATE THE SCRIPT IF THERE ARE NO PROPOSALS
if len(comments_input_df) == 0:
    logging.info('No Proposals comments found to summarise.')
    with open(os.path.join(data_path,comments_summaries_filename), 'w') as file:
        file.write('[]')
    os._exit(0)
# In[13]:
comments_input_df = comments_input_df[[col_id]+[col_content]]
# Normalise characters
comments_input_df[col_content] = comments_input_df[col_content].apply(lambda x: normalize('NFKC',x))
comments_input_df = comments_input_df.sort_values(by=col_id)
comments_input_df.reset_index(drop=True,inplace=True)
# In[14]:
# Drop empty texts
empty_txt_ids = []
for idx,row in comments_input_df.iterrows():
    if row['body'].strip() == '':
        empty_txt_ids.append(idx)
comments_input_df = comments_input_df.drop(empty_txt_ids)
comments_input_df.reset_index(drop=True,inplace=True)
# In[15]:
comments_df = pd.DataFrame()
temp_comments_joined = []
temp_comments_number = []
temp_proposal_id = []
for prop_id in sorted(list(set(comments_input_df[col_id].tolist()))):
    temp_list = comments_input_df[comments_input_df[col_id] == prop_id][col_content].tolist()
    temp_comments_joined.append('\n'.join(temp_list))
    temp_comments_number.append(len(temp_list))
    temp_proposal_id.append(prop_id)
comments_df['prop_id'] = temp_proposal_id
comments_df['comments_joined'] = temp_comments_joined
comments_df['comments_number'] = temp_comments_number
# In[16]:
# # Stats
# print(len(comments_df))
# print(len(comments_df[(comments_df['comments_number'] >= 0) & (comments_df['comments_number'] < 10)]))
# print(len(comments_df[(comments_df['comments_number'] >= 10) & (comments_df['comments_number'] < 50)]))
# print(len(comments_df[(comments_df['comments_number'] >= 50) & (comments_df['comments_number'] < 900)]))
# In[ ]:
# # Make comments lowercase
# In[17]:
comments_df['comments_joined'] = comments_df['comments_joined'].apply(lambda x: x.lower())
# In[ ]:
# # Split sentences
# In[18]:
def split_sentences(txt):
    new_text_1 = sent_tokenize(txt,sent_token_lang)
    # outputs [] if txt is ''; or made of ' ' or '\n'
    new_text_2 = []
    if new_text_1 != []:
        for tok1 in new_text_1:
            new_text_2 += tok1.split('\n')
    # outputs [''] if txt is ''
    new_text_2 = [tok.strip() for tok in new_text_2 if tok.strip() != '']
    if new_text_2 == []:
        new_text_2 = ['']
    return new_text_2
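# Illustrative behaviour (made-up input):
#   split_sentences('great idea. i agree\ncount me in')
#   -> ['great idea.', 'i agree', 'count me in']
# sent_tokenize splits on sentence boundaries and the extra split('\n') separates
# comments that were joined with newlines; empty fragments are dropped.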
# In[19]:
comments_df['comments_sentences'] = comments_df['comments_joined'].apply(split_sentences)
# In[ ]:
# # Calculate sentence embeddings
# In[20]:
# Includes some extra steps for Spanish
# List of stop words to be removed
stop_words = set(stopwords.words(stopwords_lang))
if stopwords_lang == 'spanish':
    for word in stop_words:
        stop_words = stop_words.union({re.sub(r"á","a",word)})
        stop_words = stop_words.union({re.sub(r"é","e",word)})
        stop_words = stop_words.union({re.sub(r"í","i",word)})
        stop_words = stop_words.union({re.sub(r"ó","o",word)})
        stop_words = stop_words.union({re.sub(r"ú","u",word)})
# additional terms removed when found as an independent character
if stopwords_lang == 'spanish':
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';','d','q','u'}
else:
    additional_stop_words = {'(',')',',','.','...','?','¿','!','¡',':',';'}
all_stop_words = stop_words.union(additional_stop_words)
# In[21]:
def sentences_embeddings(sents):
    sent_embs = []
    for sent in sents:
        words = set(word_tokenize(sent))
        words = words-all_stop_words
        if len(words) != 0:
            emb = sum([word_embeddings.get(word, np.zeros(300)) for word in words])/(
                len(words)+0.001)
        else:
            emb = np.zeros(300)
        sent_embs.append(emb)
    return sent_embs
# In[22]:
if tqdm_notebook:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].progress_apply(sentences_embeddings)
else:
    comments_df['comments_sentences_embeddings'] = comments_df[
        'comments_sentences'].apply(sentences_embeddings)
# In[ ]:
# # Calculate sentence scores
# In[23]:
def sentences_scores(sents, sent_embs):
    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        return collections.OrderedDict([('',1.0)])
    nx_graph = nx.from_numpy_array(sim_mat)
    try:
        sentence_weight_temp = nx.pagerank(nx_graph)
    except:
        sentence_weight_temp = dict.fromkeys([x for x in range(len(sents))], 0)
    sentence_weights = {sents[key]: value for key, value in sentence_weight_temp.items()}
    sorted_sentence_weights = sorted(sentence_weights.items(), key=lambda elem: elem[1], reverse=True)
    sentence_scores = collections.OrderedDict(sorted_sentence_weights)
    return sentence_scores
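# sentences_scores returns an OrderedDict mapping each sentence to its TextRank weight
# (PageRank over the cosine-similarity graph), highest first, e.g. (made-up values):
#   OrderedDict([('the park needs more benches', 0.41), ('i agree', 0.33), ...])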
# In[24]:
def plot_sentences_network(sents, sent_embs):
    import matplotlib.pyplot as plt
    # similarity matrix
    if len(sent_embs) > 1:
        stacked_sent_embs = np.stack(sent_embs)
        sim_mat = cosine_similarity(stacked_sent_embs,stacked_sent_embs)
        np.fill_diagonal(sim_mat, 0)
    elif len(sent_embs) == 1:
        sim_mat = np.array([[0.]])
    else:
        print('Nothing to plot')
        return
    nx_graph = nx.from_numpy_array(sim_mat)
    plt.plot()
    nx.draw(nx_graph, with_labels=True)
# In[25]:
comments_df['comments_sentences_scores'] = comments_df[['comments_sentences','comments_sentences_embeddings']].progress_apply(
    lambda row: sentences_scores(row['comments_sentences'],row['comments_sentences_embeddings']),axis=1)
# In[ ]:
# # Generate the summaries
# In[26]:
def comments_summary(sentence_weight, threshold_factor, *totalwords):
    threshold = threshold_factor * np.mean(list(sentence_weight.values()))
    sentence_counter = 0
    comments_summary = ''
    summary_num_words = 0
    for sentence in sentence_weight:
        if sentence_weight[sentence] >= (threshold):
            if len(totalwords) == 0:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
            elif summary_num_words < totalwords[0]:
                comments_summary += "\n- " + sentence
                sentence_counter += 1
                summary_num_words += len(sentence.split())
    comments_summary = comments_summary.lstrip()
    return comments_summary
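# How the selection works, with made-up numbers: if threshold_factor is 1.0 and the mean
# PageRank weight of a proposal's sentences is 0.25, only sentences scoring >= 0.25 are
# kept, in descending score order, and (when a word budget is passed, as done below with
# max_size_of_summaries) no further sentence is added once the summary has reached that
# many words.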
# In[27]:
comments_df['comments_summary'] = comments_df['comments_sentences_scores'].apply(
    lambda x: comments_summary(x,threshold_factor,max_size_of_summaries))
# In[28]:
# comments_df
# In[29]:
# for idx,row in comments_input_df[comments_input_df['commentable_id'] == 10].iterrows():
# print(row['body'])
# print('-------')
# In[30]:
#print(comments_df.loc[8,'comments_summary'])
# In[ ]:
# In[31]:
comments_df['commentable_type'] = ['Proposal']*len(comments_df)
comments_summaries_df = comments_df[['prop_id','commentable_type','comments_summary']]
comments_summaries_df.reset_index(level=0, inplace=True)
# In[32]:
comments_summaries_df = comments_summaries_df.rename(
    columns={"index": "id", "prop_id": "commentable_id", "comments_summary": "body"})
# In[33]:
#comments_summaries_df
# In[34]:
comments_summaries_df.to_json(os.path.join(data_path,comments_summaries_filename),orient="records", force_ascii=False)
comments_summaries_df.to_csv(os.path.join(data_path,comments_summaries_filename_csv), index=False)
# In[ ]:
# In[ ]:
# In[35]:
logging.info('Script executed correctly.')