Use a different machine learning folder per tenant

We're using the "tenants" subfolder for consistency with the folder
structure we use in ActiveStorage and because some CONSUL installations
might have folders inside the `data` folder which might conflict with
the folders created by tenants.

Note that the Python scripts have a lot of duplication, meaning we need
to change all of them. I'm not refactoring them because I'm not familiar
enough with these scripts (or with Python, for that matter).

Also note that the scripts folder is still shared by all tenants,
meaning it isn't possible to have different scripts for different
tenants. I'm not sure how this situation should be handled; again, I'm
not familiar enough with this feature.
This commit is contained in:
Javi Martín
2022-10-19 01:41:07 +02:00
parent 58c9e8462d
commit 2f312bf474
9 changed files with 89 additions and 50 deletions

View File

@@ -23,6 +23,7 @@ public/sitemap.xml
public/tenants/*/sitemap.xml public/tenants/*/sitemap.xml
public/assets/ public/assets/
public/machine_learning/data/ public/machine_learning/data/
public/tenants/*/machine_learning/data/
# Bundler config, cache and gemsets # Bundler config, cache and gemsets
**/.bundle/ **/.bundle/

1
.gitignore vendored
View File

@@ -25,6 +25,7 @@ tmp/
/public/tenants/*/sitemap.xml /public/tenants/*/sitemap.xml
/public/assets/ /public/assets/
/public/machine_learning/data/ /public/machine_learning/data/
/public/tenants/*/machine_learning/data/
# Bundler config, cache and gemsets # Bundler config, cache and gemsets
.bundle/ .bundle/

View File

@@ -3,7 +3,6 @@ class MachineLearning
attr_accessor :job attr_accessor :job
SCRIPTS_FOLDER = Rails.root.join("public", "machine_learning", "scripts").freeze SCRIPTS_FOLDER = Rails.root.join("public", "machine_learning", "scripts").freeze
DATA_FOLDER = Rails.root.join("public", "machine_learning", "data").freeze
def initialize(job) def initialize(job)
@job = job @job = job
@@ -11,6 +10,10 @@ class MachineLearning
@previous_modified_date = set_previous_modified_date @previous_modified_date = set_previous_modified_date
end end
# Instance-level convenience reader: forwards to the class-level
# `data_folder` so instance methods can use the same path lookup without
# spelling out `self.class` each time.
def data_folder
self.class.data_folder
end
def run def run
begin begin
export_proposals_to_json export_proposals_to_json
@@ -81,17 +84,33 @@ class MachineLearning
"comments.json" "comments.json"
end end
# Path of the folder holding the machine learning data files: the
# application's "public" directory followed by the (relative) folder
# returned by `tenant_data_folder`.
def data_folder
  public_folder = Rails.root.join("public")
  public_folder.join(tenant_data_folder)
end
# Relative path (no leading slash) of the data folder, prefixed with
# whatever `tenant_subfolder` returns.
def tenant_data_folder
  joined_path = File.join(tenant_subfolder, "machine_learning", "data")

  # When the subfolder is an empty string, File.join yields a path starting
  # with "/"; strip it so the result is always relative.
  joined_path.delete_prefix("/")
end
# Subfolder that separates one tenant's files from another's: an empty
# string when `Tenant.default?` is true, otherwise "tenants/<schema>" using
# `Tenant.current_schema`.
def tenant_subfolder
  return "" if Tenant.default?

  File.join("tenants", Tenant.current_schema)
end
def data_output_files def data_output_files
files = { tags: [], related_content: [], comments_summary: [] } files = { tags: [], related_content: [], comments_summary: [] }
files[:tags] << proposals_tags_filename if File.exists?(DATA_FOLDER.join(proposals_tags_filename)) files[:tags] << proposals_tags_filename if File.exists?(data_folder.join(proposals_tags_filename))
files[:tags] << proposals_taggings_filename if File.exists?(DATA_FOLDER.join(proposals_taggings_filename)) files[:tags] << proposals_taggings_filename if File.exists?(data_folder.join(proposals_taggings_filename))
files[:tags] << investments_tags_filename if File.exists?(DATA_FOLDER.join(investments_tags_filename)) files[:tags] << investments_tags_filename if File.exists?(data_folder.join(investments_tags_filename))
files[:tags] << investments_taggings_filename if File.exists?(DATA_FOLDER.join(investments_taggings_filename)) files[:tags] << investments_taggings_filename if File.exists?(data_folder.join(investments_taggings_filename))
files[:related_content] << proposals_related_filename if File.exists?(DATA_FOLDER.join(proposals_related_filename)) files[:related_content] << proposals_related_filename if File.exists?(data_folder.join(proposals_related_filename))
files[:related_content] << investments_related_filename if File.exists?(DATA_FOLDER.join(investments_related_filename)) files[:related_content] << investments_related_filename if File.exists?(data_folder.join(investments_related_filename))
files[:comments_summary] << proposals_comments_summary_filename if File.exists?(DATA_FOLDER.join(proposals_comments_summary_filename)) files[:comments_summary] << proposals_comments_summary_filename if File.exists?(data_folder.join(proposals_comments_summary_filename))
files[:comments_summary] << investments_comments_summary_filename if File.exists?(DATA_FOLDER.join(investments_comments_summary_filename)) files[:comments_summary] << investments_comments_summary_filename if File.exists?(data_folder.join(investments_comments_summary_filename))
files files
end end
@@ -110,10 +129,10 @@ class MachineLearning
proposals_comments_summary_filename, proposals_comments_summary_filename,
investments_comments_summary_filename investments_comments_summary_filename
] ]
json = Dir[DATA_FOLDER.join("*.json")].map do |full_path_filename| json = Dir[data_folder.join("*.json")].map do |full_path_filename|
full_path_filename.split("/").last full_path_filename.split("/").last
end end
csv = Dir[DATA_FOLDER.join("*.csv")].map do |full_path_filename| csv = Dir[data_folder.join("*.csv")].map do |full_path_filename|
full_path_filename.split("/").last full_path_filename.split("/").last
end end
(json + csv - excluded).sort (json + csv - excluded).sort
@@ -152,7 +171,7 @@ class MachineLearning
end end
def data_path(filename) def data_path(filename)
"/machine_learning/data/" + filename "/#{tenant_data_folder}/#{filename}"
end end
def script_kinds def script_kinds
@@ -196,29 +215,35 @@ class MachineLearning
private private
def create_data_folder def create_data_folder
FileUtils.mkdir_p DATA_FOLDER FileUtils.mkdir_p data_folder
end end
def export_proposals_to_json def export_proposals_to_json
create_data_folder create_data_folder
filename = DATA_FOLDER.join(MachineLearning.proposals_filename) filename = data_folder.join(MachineLearning.proposals_filename)
Proposal::Exporter.new.to_json_file(filename) Proposal::Exporter.new.to_json_file(filename)
end end
def export_budget_investments_to_json def export_budget_investments_to_json
create_data_folder create_data_folder
filename = DATA_FOLDER.join(MachineLearning.investments_filename) filename = data_folder.join(MachineLearning.investments_filename)
Budget::Investment::Exporter.new(Array.new).to_json_file(filename) Budget::Investment::Exporter.new(Array.new).to_json_file(filename)
end end
def export_comments_to_json def export_comments_to_json
create_data_folder create_data_folder
filename = DATA_FOLDER.join(MachineLearning.comments_filename) filename = data_folder.join(MachineLearning.comments_filename)
Comment::Exporter.new.to_json_file(filename) Comment::Exporter.new.to_json_file(filename)
end end
def run_machine_learning_scripts def run_machine_learning_scripts
output = `cd #{SCRIPTS_FOLDER} && python #{job.script} 2>&1` command = if Tenant.default?
"python #{job.script}"
else
"CONSUL_TENANT=#{Tenant.current_schema} python #{job.script}"
end
output = `cd #{SCRIPTS_FOLDER} && #{command} 2>&1`
result = $?.success? result = $?.success?
if result == false if result == false
job.update!(finished_at: Time.current, error: output) job.update!(finished_at: Time.current, error: output)
@@ -254,7 +279,7 @@ class MachineLearning
end end
def import_ml_proposals_comments_summary def import_ml_proposals_comments_summary
json_file = DATA_FOLDER.join(MachineLearning.proposals_comments_summary_filename) json_file = data_folder.join(MachineLearning.proposals_comments_summary_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
attributes.delete(:id) attributes.delete(:id)
@@ -266,7 +291,7 @@ class MachineLearning
end end
def import_ml_investments_comments_summary def import_ml_investments_comments_summary
json_file = DATA_FOLDER.join(MachineLearning.investments_comments_summary_filename) json_file = data_folder.join(MachineLearning.investments_comments_summary_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
attributes.delete(:id) attributes.delete(:id)
@@ -278,7 +303,7 @@ class MachineLearning
end end
def import_proposals_related_content def import_proposals_related_content
json_file = DATA_FOLDER.join(MachineLearning.proposals_related_filename) json_file = data_folder.join(MachineLearning.proposals_related_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |related| json_data.each do |related|
id = related.delete(:id) id = related.delete(:id)
@@ -306,7 +331,7 @@ class MachineLearning
end end
def import_budget_investments_related_content def import_budget_investments_related_content
json_file = DATA_FOLDER.join(MachineLearning.investments_related_filename) json_file = data_folder.join(MachineLearning.investments_related_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |related| json_data.each do |related|
id = related.delete(:id) id = related.delete(:id)
@@ -335,7 +360,7 @@ class MachineLearning
def import_ml_proposals_tags def import_ml_proposals_tags
ids = {} ids = {}
json_file = DATA_FOLDER.join(MachineLearning.proposals_tags_filename) json_file = data_folder.join(MachineLearning.proposals_tags_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
if attributes[:name].present? if attributes[:name].present?
@@ -348,7 +373,7 @@ class MachineLearning
end end
end end
json_file = DATA_FOLDER.join(MachineLearning.proposals_taggings_filename) json_file = data_folder.join(MachineLearning.proposals_taggings_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
if attributes[:tag_id].present? if attributes[:tag_id].present?
@@ -365,7 +390,7 @@ class MachineLearning
def import_ml_investments_tags def import_ml_investments_tags
ids = {} ids = {}
json_file = DATA_FOLDER.join(MachineLearning.investments_tags_filename) json_file = data_folder.join(MachineLearning.investments_tags_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
if attributes[:name].present? if attributes[:name].present?
@@ -378,7 +403,7 @@ class MachineLearning
end end
end end
json_file = DATA_FOLDER.join(MachineLearning.investments_taggings_filename) json_file = data_folder.join(MachineLearning.investments_taggings_filename)
json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!) json_data = JSON.parse(File.read(json_file)).each(&:deep_symbolize_keys!)
json_data.each do |attributes| json_data.each do |attributes|
if attributes[:tag_id].present? if attributes[:tag_id].present?
@@ -421,13 +446,13 @@ class MachineLearning
end end
def last_modified_date_for(filename) def last_modified_date_for(filename)
return nil unless File.exists? DATA_FOLDER.join(filename) return nil unless File.exists? data_folder.join(filename)
File.mtime DATA_FOLDER.join(filename) File.mtime data_folder.join(filename)
end end
def updated_file?(filename) def updated_file?(filename)
return false unless File.exists? DATA_FOLDER.join(filename) return false unless File.exists? data_folder.join(filename)
return true unless previous_modified_date[filename].present? return true unless previous_modified_date[filename].present?
last_modified_date_for(filename) > previous_modified_date[filename] last_modified_date_for(filename) > previous_modified_date[filename]

View File

@@ -63,14 +63,17 @@ tqdm_notebook = True
# In[2]: # In[2]:
import os
if os.environ.get("CONSUL_TENANT"):
data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
else:
data_path = '../data'
data_path = '../data'
config_file = 'budgets_related_content_and_tags_nmf.ini' config_file = 'budgets_related_content_and_tags_nmf.ini'
logging_file ='budgets_related_content_and_tags_nmf.log' logging_file ='budgets_related_content_and_tags_nmf.log'
# Read the configuration file # Read the configuration file
import os
import configparser import configparser
config = configparser.ConfigParser() config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file)) check_file(os.path.join(data_path,config_file))

View File

@@ -60,14 +60,17 @@ tqdm_notebook = True
# In[ ]: # In[ ]:
import os
if os.environ.get("CONSUL_TENANT"):
data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
else:
data_path = '../data'
data_path = '../data'
config_file = 'budgets_summary_comments_textrank.ini' config_file = 'budgets_summary_comments_textrank.ini'
logging_file ='budgets_summary_comments_textrank.log' logging_file ='budgets_summary_comments_textrank.log'
# Read the configuration file # Read the configuration file
import os
import configparser import configparser
config = configparser.ConfigParser() config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file)) check_file(os.path.join(data_path,config_file))

View File

@@ -63,14 +63,17 @@ tqdm_notebook = True
# In[2]: # In[2]:
import os
if os.environ.get("CONSUL_TENANT"):
data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
else:
data_path = '../data'
data_path = '../data'
config_file = 'proposals_related_content_and_tags_nmf.ini' config_file = 'proposals_related_content_and_tags_nmf.ini'
logging_file ='proposals_related_content_and_tags_nmf.log' logging_file ='proposals_related_content_and_tags_nmf.log'
# Read the configuration file # Read the configuration file
import os
import configparser import configparser
config = configparser.ConfigParser() config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file)) check_file(os.path.join(data_path,config_file))

View File

@@ -60,14 +60,17 @@ tqdm_notebook = True
# In[3]: # In[3]:
import os
if os.environ.get("CONSUL_TENANT"):
data_path = '../../tenants/' + os.environ["CONSUL_TENANT"] + '/machine_learning/data'
else:
data_path = '../data'
data_path = '../data'
config_file = 'proposals_summary_comments_textrank.ini' config_file = 'proposals_summary_comments_textrank.ini'
logging_file ='proposals_summary_comments_textrank.log' logging_file ='proposals_summary_comments_textrank.log'
# Read the configuration file # Read the configuration file
import os
import configparser import configparser
config = configparser.ConfigParser() config = configparser.ConfigParser()
check_file(os.path.join(data_path,config_file)) check_file(os.path.join(data_path,config_file))

View File

@@ -309,7 +309,7 @@ describe MachineLearning do
machine_learning = MachineLearning.new(job) machine_learning = MachineLearning.new(job)
machine_learning.send(:export_proposals_to_json) machine_learning.send(:export_proposals_to_json)
json_file = MachineLearning::DATA_FOLDER.join("proposals.json") json_file = MachineLearning.data_folder.join("proposals.json")
json = JSON.parse(File.read(json_file)) json = JSON.parse(File.read(json_file))
expect(json).to be_an Array expect(json).to be_an Array
@@ -335,7 +335,7 @@ describe MachineLearning do
machine_learning = MachineLearning.new(job) machine_learning = MachineLearning.new(job)
machine_learning.send(:export_budget_investments_to_json) machine_learning.send(:export_budget_investments_to_json)
json_file = MachineLearning::DATA_FOLDER.join("budget_investments.json") json_file = MachineLearning.data_folder.join("budget_investments.json")
json = JSON.parse(File.read(json_file)) json = JSON.parse(File.read(json_file))
expect(json).to be_an Array expect(json).to be_an Array
@@ -359,7 +359,7 @@ describe MachineLearning do
machine_learning = MachineLearning.new(job) machine_learning = MachineLearning.new(job)
machine_learning.send(:export_comments_to_json) machine_learning.send(:export_comments_to_json)
json_file = MachineLearning::DATA_FOLDER.join("comments.json") json_file = MachineLearning.data_folder.join("comments.json")
json = JSON.parse(File.read(json_file)) json = JSON.parse(File.read(json_file))
expect(json).to be_an Array expect(json).to be_an Array
@@ -428,7 +428,7 @@ describe MachineLearning do
] ]
filename = "ml_comments_summaries_proposals.json" filename = "ml_comments_summaries_proposals.json"
json_file = MachineLearning::DATA_FOLDER.join(filename) json_file = MachineLearning.data_folder.join(filename)
expect(File).to receive(:read).with(json_file).and_return data.to_json expect(File).to receive(:read).with(json_file).and_return data.to_json
machine_learning.send(:import_ml_proposals_comments_summary) machine_learning.send(:import_ml_proposals_comments_summary)
@@ -450,7 +450,7 @@ describe MachineLearning do
] ]
filename = "ml_comments_summaries_budgets.json" filename = "ml_comments_summaries_budgets.json"
json_file = MachineLearning::DATA_FOLDER.join(filename) json_file = MachineLearning.data_folder.join(filename)
expect(File).to receive(:read).with(json_file).and_return data.to_json expect(File).to receive(:read).with(json_file).and_return data.to_json
machine_learning.send(:import_ml_investments_comments_summary) machine_learning.send(:import_ml_investments_comments_summary)
@@ -476,7 +476,7 @@ describe MachineLearning do
] ]
filename = "ml_related_content_proposals.json" filename = "ml_related_content_proposals.json"
json_file = MachineLearning::DATA_FOLDER.join(filename) json_file = MachineLearning.data_folder.join(filename)
expect(File).to receive(:read).with(json_file).and_return data.to_json expect(File).to receive(:read).with(json_file).and_return data.to_json
machine_learning.send(:import_proposals_related_content) machine_learning.send(:import_proposals_related_content)
@@ -504,7 +504,7 @@ describe MachineLearning do
] ]
filename = "ml_related_content_budgets.json" filename = "ml_related_content_budgets.json"
json_file = MachineLearning::DATA_FOLDER.join(filename) json_file = MachineLearning.data_folder.join(filename)
expect(File).to receive(:read).with(json_file).and_return data.to_json expect(File).to receive(:read).with(json_file).and_return data.to_json
machine_learning.send(:import_budget_investments_related_content) machine_learning.send(:import_budget_investments_related_content)
@@ -538,11 +538,11 @@ describe MachineLearning do
] ]
tags_filename = "ml_tags_proposals.json" tags_filename = "ml_tags_proposals.json"
tags_json_file = MachineLearning::DATA_FOLDER.join(tags_filename) tags_json_file = MachineLearning.data_folder.join(tags_filename)
expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json
taggings_filename = "ml_taggings_proposals.json" taggings_filename = "ml_taggings_proposals.json"
taggings_json_file = MachineLearning::DATA_FOLDER.join(taggings_filename) taggings_json_file = MachineLearning.data_folder.join(taggings_filename)
expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json
machine_learning.send(:import_ml_proposals_tags) machine_learning.send(:import_ml_proposals_tags)
@@ -580,11 +580,11 @@ describe MachineLearning do
] ]
tags_filename = "ml_tags_budgets.json" tags_filename = "ml_tags_budgets.json"
tags_json_file = MachineLearning::DATA_FOLDER.join(tags_filename) tags_json_file = MachineLearning.data_folder.join(tags_filename)
expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json expect(File).to receive(:read).with(tags_json_file).and_return tags_data.to_json
taggings_filename = "ml_taggings_budgets.json" taggings_filename = "ml_taggings_budgets.json"
taggings_json_file = MachineLearning::DATA_FOLDER.join(taggings_filename) taggings_json_file = MachineLearning.data_folder.join(taggings_filename)
expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json expect(File).to receive(:read).with(taggings_json_file).and_return taggings_data.to_json
machine_learning.send(:import_ml_investments_tags) machine_learning.send(:import_ml_investments_tags)

View File

@@ -207,7 +207,7 @@ describe "Machine learning" do
end end
scenario "Show output files info on settins page" do scenario "Show output files info on settins page" do
FileUtils.mkdir_p MachineLearning::DATA_FOLDER FileUtils.mkdir_p MachineLearning.data_folder
allow_any_instance_of(MachineLearning).to receive(:run) do allow_any_instance_of(MachineLearning).to receive(:run) do
MachineLearningJob.first.update!(finished_at: 2.minutes.from_now) MachineLearningJob.first.update!(finished_at: 2.minutes.from_now)
@@ -215,9 +215,9 @@ describe "Machine learning" do
script: "proposals_summary_comments_textrank.py", script: "proposals_summary_comments_textrank.py",
kind: "comments_summary", kind: "comments_summary",
updated_at: 2.minutes.from_now) updated_at: 2.minutes.from_now)
comments_file = MachineLearning::DATA_FOLDER.join(MachineLearning.comments_filename) comments_file = MachineLearning.data_folder.join(MachineLearning.comments_filename)
File.write(comments_file, [].to_json) File.write(comments_file, [].to_json)
proposals_comments_summary_file = MachineLearning::DATA_FOLDER.join(MachineLearning.proposals_comments_summary_filename) proposals_comments_summary_file = MachineLearning.data_folder.join(MachineLearning.proposals_comments_summary_filename)
File.write(proposals_comments_summary_file, [].to_json) File.write(proposals_comments_summary_file, [].to_json)
end end