From 2f312bf4742c87067f023ddf5ad62e83afdc6197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javi=20Mart=C3=ADn?= Date: Wed, 19 Oct 2022 01:41:07 +0200 Subject: [PATCH] Use a different machine learning folder per tenant We're using the "tenants" subfolder for consistency with the folder structure we use in ActiveStorage and because some CONSUL installations might have folders inside the `data` folder which might conflict with the folders created by tenants. Note that the Python scripts have a lot of duplication, meaning we need to change all of them. I'm not refactoring them because I'm not familiar enough with these scripts (or with Python, for that matter). Also note that the scripts folder is still shared by all tenants, meaning it isn't possible to have different scripts for different tenants. I'm not sure how this situation should be handled; again, I'm not familiar enough with this feature. --- .dockerignore | 1 + .gitignore | 1 + app/models/machine_learning.rb | 81 ++++++++++++------- .../budgets_related_content_and_tags_nmf.py | 7 +- .../budgets_summary_comments_textrank.py | 7 +- .../proposals_related_content_and_tags_nmf.py | 7 +- .../proposals_summary_comments_textrank.py | 7 +- spec/models/machine_learning_spec.rb | 22 ++--- spec/system/admin/machine_learning_spec.rb | 6 +- 9 files changed, 89 insertions(+), 50 deletions(-) diff --git a/.dockerignore b/.dockerignore index 41853f5ae..8c75b254c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -23,6 +23,7 @@ public/sitemap.xml public/tenants/*/sitemap.xml public/assets/ public/machine_learning/data/ +public/tenants/*/machine_learning/data/ # Bundler config, cache and gemsets **/.bundle/ diff --git a/.gitignore b/.gitignore index 3ac23e93f..28d1c676c 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ tmp/ /public/tenants/*/sitemap.xml /public/assets/ /public/machine_learning/data/ +/public/tenants/*/machine_learning/data/ # Bundler config, cache and gemsets .bundle/ diff --git a/app/models/machine_learning.rb b/app/models/machine_learning.rb index 76be044b8..0b680e212 100644 --- a/app/models/machine_learning.rb +++ b/app/models/machine_learning.rb @@ -3,7 +3,6 @@ class MachineLearning attr_accessor :job SCRIPTS_FOLDER = Rails.root.join("public", "machine_learning", "scripts").freeze - DATA_FOLDER = Rails.root.join("public", "machine_learning", "data").freeze def initialize(job) @job = job @@ -11,6 +10,10 @@ class MachineLearning @previous_modified_date = set_previous_modified_date end + def data_folder + self.class.data_folder + end + def run begin export_proposals_to_json @@ -81,17 +84,33 @@ class MachineLearning "comments.json" end + def data_folder + Rails.root.join("public", tenant_data_folder) + end + + def tenant_data_folder + File.join(tenant_subfolder, "machine_learning", "data").delete_prefix("/") + end + + def tenant_subfolder + if Tenant.default? + "" + else + File.join("tenants", Tenant.current_schema) + end + end + def data_output_files files = { tags: [], related_content: [], comments_summary: [] } - files[:tags] << proposals_tags_filename if File.exists?(DATA_FOLDER.join(proposals_tags_filename)) - files[:tags] << proposals_taggings_filename if File.exists?(DATA_FOLDER.join(proposals_taggings_filename)) - files[:tags] << investments_tags_filename if File.exists?(DATA_FOLDER.join(investments_tags_filename)) - files[:tags] << investments_taggings_filename if File.exists?(DATA_FOLDER.join(investments_taggings_filename)) - files[:related_content] << proposals_related_filename if File.exists?(DATA_FOLDER.join(proposals_related_filename)) - files[:related_content] << investments_related_filename if File.exists?(DATA_FOLDER.join(investments_related_filename)) - files[:comments_summary] << proposals_comments_summary_filename if File.exists?(DATA_FOLDER.join(proposals_comments_summary_filename)) - files[:comments_summary] << investments_comments_summary_filename if File.exists?(DATA_FOLDER.join(investments_comments_summary_filename)) + files[:tags] << proposals_tags_filename if File.exists?(data_folder.join(proposals_tags_filename)) + files[:tags] << proposals_taggings_filename if File.exists?(data_folder.join(proposals_taggings_filename)) + files[:tags] << investments_tags_filename if File.exists?(data_folder.join(investments_tags_filename)) + files[:tags] << investments_taggings_filename if File.exists?(data_folder.join(investments_taggings_filename)) + files[:related_content] << proposals_related_filename if File.exists?(data_folder.join(proposals_related_filename)) + files[:related_content] << investments_related_filename if File.exists?(data_folder.join(investments_related_filename)) + files[:comments_summary] << proposals_comments_summary_filename if File.exists?(data_folder.join(proposals_comments_summary_filename)) + files[:comments_summary] << investments_comments_summary_filename if File.exists?(data_folder.join(investments_comments_summary_filename)) files end @@ -110,10 +129,10 @@ class MachineLearning proposals_comments_summary_filename, investments_comments_summary_filename ] - json = Dir[DATA_FOLDER.join("*.json")].map do |full_path_filename| + json = Dir[data_folder.join("*.json")].map do |full_path_filename| full_path_filename.split("/").last end - csv = Dir[DATA_FOLDER.join("*.csv")].map do |full_path_filename| + csv = Dir[data_folder.join("*.csv")].map do |full_path_filename| full_path_filename.split("/").last end (json + csv - excluded).sort @@ -152,7 +171,7 @@ class MachineLearning end def data_path(filename) - "/machine_learning/data/" + filename + "/#{tenant_data_folder}/#{filename}" end def script_kinds @@ -196,29 +215,35 @@ class MachineLearning private def create_data_folder - FileUtils.mkdir_p DATA_FOLDER + FileUtils.mkdir_p data_folder end def export_proposals_to_json create_data_folder - filename = DATA_FOLDER.join(MachineLearning.proposals_filename) + filename = data_folder.join(MachineLearning.proposals_filename) Proposal::Exporter.new.to_json_file(filename) end def export_budget_investments_to_json create_data_folder - filename = DATA_FOLDER.join(MachineLearning.investments_filename) + filename = data_folder.join(MachineLearning.investments_filename) Budget::Investment::Exporter.new(Array.new).to_json_file(filename) end def export_comments_to_json create_data_folder - filename = DATA_FOLDER.join(MachineLearning.comments_filename) + filename = data_folder.join(MachineLearning.comments_filename) Comment::Exporter.new.to_json_file(filename) end def run_machine_learning_scripts - output = `cd #{SCRIPTS_FOLDER} && python #{job.script} 2>&1` + command = if Tenant.default? + "python #{job.script}" + else + "CONSUL_TENANT=#{Tenant.current_schema} python #{job.script}" + end + + output = `cd #{SCRIPTS_FOLDER} && #{command} 2>&1` result = $?.success? if result == false job.update!(finished_at: Time.current, error: output) @@ -254,7 +279,7 @@ class MachineLearning end def import_ml_proposals_comments_summary - json_file = DATA_FOLDER.join(MachineLearning.proposals_comments_summary_filename) + json_file = data_folder.join(MachineLearning.proposals_comments_summary_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| attributes.delete(:id) @@ -266,7 +291,7 @@ class MachineLearning end def import_ml_investments_comments_summary - json_file = DATA_FOLDER.join(MachineLearning.investments_comments_summary_filename) + json_file = data_folder.join(MachineLearning.investments_comments_summary_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| attributes.delete(:id) @@ -278,7 +303,7 @@ class MachineLearning end def import_proposals_related_content - json_file = DATA_FOLDER.join(MachineLearning.proposals_related_filename) + json_file = data_folder.join(MachineLearning.proposals_related_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |related| id = related.delete(:id) @@ -306,7 +331,7 @@ class MachineLearning end def import_budget_investments_related_content - json_file = DATA_FOLDER.join(MachineLearning.investments_related_filename) + json_file = data_folder.join(MachineLearning.investments_related_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |related| id = related.delete(:id) @@ -335,7 +360,7 @@ class MachineLearning def import_ml_proposals_tags ids = {} - json_file = DATA_FOLDER.join(MachineLearning.proposals_tags_filename) + json_file = data_folder.join(MachineLearning.proposals_tags_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| if attributes[:name].present? @@ -348,7 +373,7 @@ class MachineLearning end end - json_file = DATA_FOLDER.join(MachineLearning.proposals_taggings_filename) + json_file = data_folder.join(MachineLearning.proposals_taggings_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| if attributes[:tag_id].present? @@ -365,7 +390,7 @@ class MachineLearning def import_ml_investments_tags ids = {} - json_file = DATA_FOLDER.join(MachineLearning.investments_tags_filename) + json_file = data_folder.join(MachineLearning.investments_tags_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| if attributes[:name].present? @@ -378,7 +403,7 @@ class MachineLearning end end - json_file = DATA_FOLDER.join(MachineLearning.investments_taggings_filename) + json_file = data_folder.join(MachineLearning.investments_taggings_filename) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data.each do |attributes| if attributes[:tag_id].present? @@ -421,13 +446,13 @@ class MachineLearning end def last_modified_date_for(filename) - return nil unless File.exists? DATA_FOLDER.join(filename) + return nil unless File.exists? data_folder.join(filename) - File.mtime DATA_FOLDER.join(filename) + File.mtime data_folder.join(filename) end def updated_file?(filename) - return false unless File.exists? DATA_FOLDER.join(filename) + return false unless File.exists? data_folder.join(filename) return true unless previous_modified_date[filename].present? last_modified_date_for(filename) > previous_modified_date[filename] diff --git a/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py b/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py index da40f73b3..d43cfbc70 100644 --- a/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py +++ b/public/machine_learning/scripts/budgets_related_content_and_tags_nmf.py @@ -63,14 +63,17 @@ tqdm_notebook = True # In[2]: +import os +if os.environ.get("CONSUL_TENANT"): + data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data' +else: + data_path = '../data' -data_path = '../data' config_file = 'budgets_related_content_and_tags_nmf.ini' logging_file ='budgets_related_content_and_tags_nmf.log' # Read the configuration file -import os import configparser config = configparser.ConfigParser() check_file(os.path.join(data_path,config_file)) diff --git a/public/machine_learning/scripts/budgets_summary_comments_textrank.py b/public/machine_learning/scripts/budgets_summary_comments_textrank.py index 1c0faf07b..a1dec2b6f 100644 --- a/public/machine_learning/scripts/budgets_summary_comments_textrank.py +++ b/public/machine_learning/scripts/budgets_summary_comments_textrank.py @@ -60,14 +60,17 @@ tqdm_notebook = True # In[ ]: +import os +if os.environ.get("CONSUL_TENANT"): + data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data' +else: + data_path = '../data' -data_path = '../data' config_file = 'budgets_summary_comments_textrank.ini' logging_file ='budgets_summary_comments_textrank.log' # Read the configuration file -import os import configparser config = configparser.ConfigParser() check_file(os.path.join(data_path,config_file)) diff --git a/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py b/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py index 4c303ad28..df0af7945 100644 --- a/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py +++ b/public/machine_learning/scripts/proposals_related_content_and_tags_nmf.py @@ -63,14 +63,17 @@ tqdm_notebook = True # In[2]: +import os +if os.environ.get("CONSUL_TENANT"): + data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data' +else: + data_path = '../data' -data_path = '../data' config_file = 'proposals_related_content_and_tags_nmf.ini' logging_file ='proposals_related_content_and_tags_nmf.log' # Read the configuration file -import os import configparser config = configparser.ConfigParser() check_file(os.path.join(data_path,config_file)) diff --git a/public/machine_learning/scripts/proposals_summary_comments_textrank.py b/public/machine_learning/scripts/proposals_summary_comments_textrank.py index 440083558..ac5d2569a 100644 --- a/public/machine_learning/scripts/proposals_summary_comments_textrank.py +++ b/public/machine_learning/scripts/proposals_summary_comments_textrank.py @@ -60,14 +60,17 @@ tqdm_notebook = True # In[3]: +import os +if os.environ.get("CONSUL_TENANT"): + data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data' +else: + data_path = '../data' -data_path = '../data' config_file = 'proposals_summary_comments_textrank.ini' logging_file ='proposals_summary_comments_textrank.log' # Read the configuration file -import os import configparser config = configparser.ConfigParser() check_file(os.path.join(data_path,config_file)) diff --git a/spec/models/machine_learning_spec.rb b/spec/models/machine_learning_spec.rb index da0788e15..f3f8af8f7 100644 --- a/spec/models/machine_learning_spec.rb +++ b/spec/models/machine_learning_spec.rb @@ -309,7 +309,7 @@ describe MachineLearning do machine_learning = MachineLearning.new(job) machine_learning.send(:export_proposals_to_json) - json_file = MachineLearning::DATA_FOLDER.join("proposals.json") + json_file = MachineLearning.data_folder.join("proposals.json") json = JSON.parse(File.read(json_file)) expect(json).to be_an Array @@ -335,7 +335,7 @@ describe MachineLearning do machine_learning = MachineLearning.new(job) machine_learning.send(:export_budget_investments_to_json) - json_file = MachineLearning::DATA_FOLDER.join("budget_investments.json") + json_file = MachineLearning.data_folder.join("budget_investments.json") json = JSON.parse(File.read(json_file)) expect(json).to be_an Array @@ -359,7 +359,7 @@ describe MachineLearning do machine_learning = MachineLearning.new(job) machine_learning.send(:export_comments_to_json) - json_file = MachineLearning::DATA_FOLDER.join("comments.json") + json_file = MachineLearning.data_folder.join("comments.json") json = JSON.parse(File.read(json_file)) expect(json).to be_an Array @@ -428,7 +428,7 @@ describe MachineLearning do ] filename = "ml_comments_summaries_proposals.json" - json_file = MachineLearning::DATA_FOLDER.join(filename) + json_file = MachineLearning.data_folder.join(filename) expect(File).to receive(:read).with(json_file).and_return data.to_json machine_learning.send(:import_ml_proposals_comments_summary) @@ -450,7 +450,7 @@ describe MachineLearning do ] filename = "ml_comments_summaries_budgets.json" - json_file = MachineLearning::DATA_FOLDER.join(filename) + json_file = MachineLearning.data_folder.join(filename) expect(File).to receive(:read).with(json_file).and_return data.to_json machine_learning.send(:import_ml_investments_comments_summary) @@ -476,7 +476,7 @@ describe MachineLearning do ] filename = "ml_related_content_proposals.json" - json_file = MachineLearning::DATA_FOLDER.join(filename) + json_file = MachineLearning.data_folder.join(filename) expect(File).to receive(:read).with(json_file).and_return data.to_json machine_learning.send(:import_proposals_related_content) @@ -504,7 +504,7 @@ describe MachineLearning do ] filename = "ml_related_content_budgets.json" - json_file = MachineLearning::DATA_FOLDER.join(filename) + json_file = MachineLearning.data_folder.join(filename) expect(File).to receive(:read).with(json_file).and_return data.to_json machine_learning.send(:import_budget_investments_related_content) @@ -538,11 +538,11 @@ describe MachineLearning do ] tags_filename = "ml_tags_proposals.json" - tags_json_file = MachineLearning::DATA_FOLDER.join(tags_filename) + tags_json_file = MachineLearning.data_folder.join(tags_filename) expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json taggings_filename = "ml_taggings_proposals.json" - taggings_json_file = MachineLearning::DATA_FOLDER.join(taggings_filename) + taggings_json_file = MachineLearning.data_folder.join(taggings_filename) expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json machine_learning.send(:import_ml_proposals_tags) @@ -580,11 +580,11 @@ describe MachineLearning do ] tags_filename = "ml_tags_budgets.json" - tags_json_file = MachineLearning::DATA_FOLDER.join(tags_filename) + tags_json_file = MachineLearning.data_folder.join(tags_filename) expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json taggings_filename = "ml_taggings_budgets.json" - taggings_json_file = MachineLearning::DATA_FOLDER.join(taggings_filename) + taggings_json_file = MachineLearning.data_folder.join(taggings_filename) expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json machine_learning.send(:import_ml_investments_tags) diff --git a/spec/system/admin/machine_learning_spec.rb b/spec/system/admin/machine_learning_spec.rb index 03a476026..d058f8eb9 100644 --- a/spec/system/admin/machine_learning_spec.rb +++ b/spec/system/admin/machine_learning_spec.rb @@ -207,7 +207,7 @@ describe "Machine learning" do end scenario "Show output files info on settins page" do - FileUtils.mkdir_p MachineLearning::DATA_FOLDER + FileUtils.mkdir_p MachineLearning.data_folder allow_any_instance_of(MachineLearning).to receive(:run) do MachineLearningJob.first.update!(finished_at: 2.minutes.from_now) @@ -215,9 +215,9 @@ describe "Machine learning" do script: "proposals_summary_comments_textrank.py", kind: "comments_summary", updated_at: 2.minutes.from_now) - comments_file = MachineLearning::DATA_FOLDER.join(MachineLearning.comments_filename) + comments_file = MachineLearning.data_folder.join(MachineLearning.comments_filename) File.write(comments_file, [].to_json) - proposals_comments_summary_file = MachineLearning::DATA_FOLDER.join(MachineLearning.proposals_comments_summary_filename) + proposals_comments_summary_file = MachineLearning.data_folder.join(MachineLearning.proposals_comments_summary_filename) File.write(proposals_comments_summary_file, [].to_json) end