Update machine learning ini files

2021-09-07 18:24:04 +02:00
parent 6d6888f201
commit b6f0b1b063
4 changed files with 90 additions and 0 deletions
--- a/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.ini
+++ b/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.ini
@@ -0,0 +1,30 @@
+[PREPROCESSING]
+#stanza_model_lang = es
+stanza_model_lang = en
+#stopwords_lang = spanish
+stopwords_lang = english
+noun_lemmatisation = True
+n_gram_min_count = 50
+stanza_download = True
+nltk_download = True
+
+[RELATED_PROPOSALS]
+# Max number of related proposals to find for each proposal:
+numb_related_proposals = 2
+
+[TOPIC_MODELLING]
+# Number of topics:
+numb_topics = 3
+# Number of tags/keywords for each topic:
+numb_topkeywords_pertopic = 5
+# Number of top representative proposals to extract for each topic:
+n_top_represent_props = 2
+# Consider only the top 'n_features' words of the corpus (ordered by word frequency):
+n_features = 10000
+# Ignore words that appear in less than a 'min_df_val' fraction of documents (ratio in the [0.0, 1.0] interval):
+min_df_val = 0.01
+# Ignore words that appear in more than a 'max_df_val' fraction of documents (ratio in the [0.0, 1.0] interval):
+max_df_val = 0.9
+
+[LOGGING]
+logging_level=INFO
--- a/public/machine_learning/scripts/budgets_summary_comments_textrank.ini
+++ b/public/machine_learning/scripts/budgets_summary_comments_textrank.ini
@@ -0,0 +1,15 @@
+[PREPROCESSING]
+#stopwords_lang = spanish
+stopwords_lang = english
+#sent_token_lang = spanish
+sent_token_lang = english
+nltk_download = True
+
+[SUMMARISATION]
+glove_file_es = glove-sbwc.i25.vec
+glove_file_en = glove.6B.300d.txt
+threshold_factor = 1.0
+max_size_of_summaries = 50
+
+[LOGGING]
+logging_level=INFO
--- a/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.ini
+++ b/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.ini
@@ -0,0 +1,30 @@
+[PREPROCESSING]
+#stanza_model_lang = es
+stanza_model_lang = en
+#stopwords_lang = spanish
+stopwords_lang = english
+noun_lemmatisation = True
+n_gram_min_count = 50
+stanza_download = True
+nltk_download = True
+
+[RELATED_PROPOSALS]
+# Max number of related proposals to find for each proposal:
+numb_related_proposals = 2
+
+[TOPIC_MODELLING]
+# Number of topics:
+numb_topics = 3
+# Number of tags/keywords for each topic:
+numb_topkeywords_pertopic = 5
+# Number of top representative proposals to extract for each topic:
+n_top_represent_props = 2
+# Consider only the top 'n_features' words of the corpus (ordered by word frequency):
+n_features = 10000
+# Ignore words that appear in less than a 'min_df_val' fraction of documents (ratio in the [0.0, 1.0] interval):
+min_df_val = 0.01
+# Ignore words that appear in more than a 'max_df_val' fraction of documents (ratio in the [0.0, 1.0] interval):
+max_df_val = 0.9
+
+[LOGGING]
+logging_level=INFO
--- a/public/machine_learning/scripts/proposals_summary_comments_textrank.ini
+++ b/public/machine_learning/scripts/proposals_summary_comments_textrank.ini
@@ -0,0 +1,15 @@
+[PREPROCESSING]
+#stopwords_lang = spanish
+stopwords_lang = english
+#sent_token_lang = spanish
+sent_token_lang = english
+nltk_download = True
+
+[SUMMARISATION]
+glove_file_es = glove-sbwc.i25.vec
+glove_file_en = glove.6B.300d.txt
+threshold_factor = 1.0
+max_size_of_summaries = 50
+
+[LOGGING]
+logging_level=INFO