Note we're excluding a few files:

* Configuration files that weren't generated by us
* Migration files that weren't generated by us
* The Gemfile, since it includes an important comment that must be on the same line as the gem declaration
* The Budget::Stats class, since the heading statistics are a mess and having shorter lines would require a lot of refactoring
481 lines
16 KiB
Ruby
481 lines
16 KiB
Ruby
class MachineLearning
|
|
# Read-only state captured when the instance is built.
# NOTE(review): no visible code in this class assigns @script, so `script`
# presumably always returns nil — confirm before relying on it.
attr_reader :user
attr_reader :script
attr_reader :previous_modified_date

# The job can be both read and replaced from outside.
attr_accessor :job

# Folder the Python scripts are listed from and executed in.
SCRIPTS_FOLDER = Rails.root.join("public", "machine_learning", "scripts").freeze
|
|
|
|
# Builds a runner for the given job.
#
# Keeps the job and its user, and snapshots the modification time of every
# output file (via set_previous_modified_date) so that updated_file? can
# later tell which files were refreshed.
def initialize(job)
  @job = job
  @user = @job.user
  @previous_modified_date = set_previous_modified_date
end
|
|
|
|
# Instance-level shortcut for the class-level data_folder.
def data_folder
  self.class.data_folder
end
|
|
|
|
# Exports the data, runs the job's script and imports every output file the
# script refreshed.
#
# Each import step runs only when all of its output files changed since this
# instance was built: it calls the matching cleanup method, then the matching
# import method, then updates the MachineLearningInfo record of that kind.
# When the script fails, run_machine_learning_scripts has already recorded
# the failure and nothing is imported. On success the job is marked as
# finished and a success email is enqueued.
#
# Any exception is recorded through handle_error and then re-raised.
def run
  export_proposals_to_json
  export_budget_investments_to_json
  export_comments_to_json

  return unless run_machine_learning_scripts

  # [filename readers on MachineLearning, cleanup method, import method, kind]
  import_steps = [
    [%i[proposals_taggings_filename proposals_tags_filename],
     :cleanup_proposals_tags!, :import_ml_proposals_tags, "tags"],
    [%i[investments_taggings_filename investments_tags_filename],
     :cleanup_investments_tags!, :import_ml_investments_tags, "tags"],
    [%i[proposals_related_filename],
     :cleanup_proposals_related_content!, :import_proposals_related_content, "related_content"],
    [%i[investments_related_filename],
     :cleanup_investments_related_content!, :import_budget_investments_related_content, "related_content"],
    [%i[proposals_comments_summary_filename],
     :cleanup_proposals_comments_summary!, :import_ml_proposals_comments_summary, "comments_summary"],
    [%i[investments_comments_summary_filename],
     :cleanup_investments_comments_summary!, :import_ml_investments_comments_summary, "comments_summary"]
  ]

  import_steps.each do |filename_readers, cleanup, import, kind|
    all_updated = filename_readers.all? do |reader|
      updated_file?(MachineLearning.public_send(reader))
    end
    next unless all_updated

    send(cleanup)
    send(import)
    update_machine_learning_info_for(kind)
  end

  job.update!(finished_at: Time.current)
  Mailer.machine_learning_success(user).deliver_later
rescue Exception => error # rubocop:disable Lint/RescueException
  # Deliberately broad: the failure is stored on the job and the very same
  # exception is re-raised right after, so nothing is swallowed.
  handle_error(error)
  raise
end
|
|
handle_asynchronously :run, queue: "machine_learning"
|
|
|
|
class << self
|
|
# Whether the "feature.machine_learning" setting holds a non-blank value.
def enabled?
  !Setting["feature.machine_learning"].blank?
end
|
|
|
|
# Name of the JSON file the proposals are exported to.
def proposals_filename
  "proposals.json"
end
|
|
|
|
# Name of the JSON file the budget investments are exported to.
def investments_filename
  "budget_investments.json"
end
|
|
|
|
# Name of the JSON file the comments are exported to.
def comments_filename
  "comments.json"
end
|
|
|
|
# Path built from Rails.root, "public" and tenant_data_folder.
def data_folder
  public_folder = Rails.root.join("public")
  public_folder.join(tenant_data_folder)
end
|
|
|
|
# Data folder location as computed by Tenant.path_with_subfolder for
# "machine_learning/data" (used both on disk and in data_path).
def tenant_data_folder
  Tenant.path_with_subfolder("machine_learning/data")
end
|
|
|
|
# Lists the output files currently present in the data folder, grouped by
# script kind.
#
# Returns a hash with the keys :tags, :related_content and :comments_summary;
# each value is an array with the names of the existing files of that kind
# (an empty array when none exists).
def data_output_files
  candidates = {
    tags: [
      proposals_tags_filename,
      proposals_taggings_filename,
      investments_tags_filename,
      investments_taggings_filename
    ],
    related_content: [
      proposals_related_filename,
      investments_related_filename
    ],
    comments_summary: [
      proposals_comments_summary_filename,
      investments_comments_summary_filename
    ]
  }

  folder = data_folder
  candidates.transform_values do |filenames|
    filenames.select { |filename| File.exist?(folder.join(filename)) }
  end
end
|
|
|
|
# Lists the JSON and CSV files in the data folder that are neither exported
# input files nor known output files.
#
# Returns the file names (without directory) sorted alphabetically.
def data_intermediate_files
  known_files = [
    proposals_filename,
    investments_filename,
    comments_filename,
    proposals_tags_filename,
    proposals_taggings_filename,
    investments_tags_filename,
    investments_taggings_filename,
    proposals_related_filename,
    investments_related_filename,
    proposals_comments_summary_filename,
    investments_comments_summary_filename
  ]

  found_files = %w[*.json *.csv].flat_map do |pattern|
    Dir[data_folder.join(pattern)].map { |path| File.basename(path) }
  end

  (found_files - known_files).sort
end
|
|
|
|
# Name of the output file holding the tags generated for proposals.
def proposals_tags_filename
  "ml_tags_proposals.json"
end
|
|
|
|
# Name of the output file holding the taggings generated for proposals.
def proposals_taggings_filename
  "ml_taggings_proposals.json"
end
|
|
|
|
# Name of the output file holding the tags generated for budget investments.
def investments_tags_filename
  "ml_tags_budgets.json"
end
|
|
|
|
# Name of the output file holding the taggings generated for budget investments.
def investments_taggings_filename
  "ml_taggings_budgets.json"
end
|
|
|
|
# Name of the output file holding the related content generated for proposals.
def proposals_related_filename
  "ml_related_content_proposals.json"
end
|
|
|
|
# Name of the output file holding the related content generated for budget investments.
def investments_related_filename
  "ml_related_content_budgets.json"
end
|
|
|
|
# Name of the output file holding the comments summaries generated for proposals.
def proposals_comments_summary_filename
  "ml_comments_summaries_proposals.json"
end
|
|
|
|
# Name of the output file holding the comments summaries generated for budget investments.
def investments_comments_summary_filename
  "ml_comments_summaries_budgets.json"
end
|
|
|
|
# Builds the slash-prefixed path "/<tenant_data_folder>/<filename>".
def data_path(filename)
  format("/%s/%s", tenant_data_folder, filename)
end
|
|
|
|
# Kinds of results a script can generate.
def script_kinds
  ["tags", "related_content", "comments_summary"]
end
|
|
|
|
# Describes every "*.py" file found in SCRIPTS_FOLDER.
#
# Returns an array of hashes with the keys :name (file name without
# directory) and :description (see description_from), sorted by name.
def scripts_info
  script_paths = Dir[SCRIPTS_FOLDER.join("*.py")]

  unsorted = script_paths.map do |script_path|
    { name: File.basename(script_path), description: description_from(script_path) }
  end

  unsorted.sort_by { |info| info[:name] }
end
|
|
|
|
# Extracts the first triple-quoted docstring of a Python script as HTML text.
#
# The docstring starts on the first line beginning with three double quotes
# and ends on the next line containing that delimiter. A docstring that opens
# and closes on the same line is supported too. Lines are stripped and joined
# with "<br>".
#
# script_filename - path of the script to read.
#
# Returns the description, or an empty string when no delimiter is found.
def description_from(script_filename)
  delimiter = '"""'
  break_line = "<br>"
  description = +""
  comment_found = false

  File.foreach(script_filename) do |line|
    if !comment_found && line.start_with?(delimiter)
      comment_found = true
      opening_text = line.sub(delimiter, "")

      if opening_text.include?(delimiter)
        # One-line docstring: the closing delimiter sits on the opening line,
        # so finish here instead of reading code lines as description.
        description << opening_text.sub(delimiter, "").strip
        break
      end

      description << opening_text.strip << break_line if opening_text.present?
    elsif line.include?(delimiter)
      closing_text = line.sub(delimiter, "")
      description << closing_text.strip if closing_text.present?
      break
    elsif comment_found
      description << line.strip << break_line
    end
  end

  description.delete_suffix(break_line)
end
|
|
end
|
|
|
|
private
|
|
|
|
# Makes sure the data folder (and any missing parent folder) exists.
def create_data_folder
  FileUtils.mkdir_p(data_folder)
end
|
|
|
|
# Writes the proposals export into the data folder, creating the folder first.
def export_proposals_to_json
  create_data_folder

  destination = data_folder.join(MachineLearning.proposals_filename)
  exporter = Proposal::Exporter.new
  exporter.to_json_file(destination)
end
|
|
|
|
# Writes the budget investments export into the data folder, creating the
# folder first. The exporter is built with an empty array as its argument.
def export_budget_investments_to_json
  create_data_folder

  destination = data_folder.join(MachineLearning.investments_filename)
  exporter = Budget::Investment::Exporter.new([])
  exporter.to_json_file(destination)
end
|
|
|
|
# Writes the comments export into the data folder, creating the folder first.
def export_comments_to_json
  create_data_folder

  destination = data_folder.join(MachineLearning.comments_filename)
  exporter = Comment::Exporter.new
  exporter.to_json_file(destination)
end
|
|
|
|
# Runs the job's Python script inside SCRIPTS_FOLDER and reports whether it
# succeeded.
#
# The interpreter is started directly with an argument vector instead of a
# shell command line, so nothing in job.script is interpreted as shell
# syntax; the whole value is passed to python as a single argument. Standard
# error is merged into the captured output. For non-default tenants the
# CONSUL_TENANT environment variable is set to the current schema.
#
# On failure (the process did not exit successfully, or it could not be
# started at all) the job is marked as finished with the captured output as
# its error and an error email is enqueued.
#
# Returns true when the script exited successfully, false otherwise.
def run_machine_learning_scripts
  environment = Tenant.default? ? {} : { "CONSUL_TENANT" => Tenant.current_schema.to_s }
  spawn_options = { chdir: SCRIPTS_FOLDER.to_s, err: [:child, :out] }

  begin
    output = IO.popen(environment, ["python", job.script.to_s], **spawn_options, &:read)
    # success? is nil when the process was killed by a signal; treat that as failure
    success = $?.success? || false
  rescue SystemCallError => error
    # The interpreter or the scripts folder is missing. The shell used to
    # report this as a failed command, so keep reporting it as a failure.
    output = error.message
    success = false
  end

  unless success
    job.update!(finished_at: Time.current, error: output)
    Mailer.machine_learning_error(user).deliver_later
  end

  success
end
|
|
|
|
# Destroys the "ml_tags" taggings of proposals and then every tag left
# without any tagging.
def cleanup_proposals_tags!
  ml_taggings = Tagging.where(context: "ml_tags", taggable_type: "Proposal")
  ml_taggings.find_each { |tagging| tagging.destroy! }

  Tag.find_each do |tag|
    next unless Tagging.where(tag: tag).empty?

    tag.destroy!
  end
end
|
|
|
|
# Destroys the "ml_tags" taggings of budget investments and then every tag
# left without any tagging.
def cleanup_investments_tags!
  ml_taggings = Tagging.where(context: "ml_tags", taggable_type: "Budget::Investment")
  ml_taggings.find_each { |tagging| tagging.destroy! }

  Tag.find_each do |tag|
    next unless Tagging.where(tag: tag).empty?

    tag.destroy!
  end
end
|
|
|
|
# Calls really_destroy! on every related content record selected by the
# with_hidden, for_proposals and from_machine_learning scopes.
def cleanup_proposals_related_content!
  ml_related_contents = RelatedContent.with_hidden.for_proposals.from_machine_learning
  ml_related_contents.find_each { |related_content| related_content.really_destroy! }
end
|
|
|
|
# Calls really_destroy! on every related content record selected by the
# with_hidden, for_investments and from_machine_learning scopes.
def cleanup_investments_related_content!
  ml_related_contents = RelatedContent.with_hidden.for_investments.from_machine_learning
  ml_related_contents.find_each { |related_content| related_content.really_destroy! }
end
|
|
|
|
# Destroys every comments summary whose commentable type is "Proposal".
def cleanup_proposals_comments_summary!
  summaries = MlSummaryComment.where(commentable_type: "Proposal")
  summaries.find_each { |summary| summary.destroy! }
end
|
|
|
|
# Destroys every comments summary whose commentable type is "Budget::Investment".
def cleanup_investments_comments_summary!
  summaries = MlSummaryComment.where(commentable_type: "Budget::Investment")
  summaries.find_each { |summary| summary.destroy! }
end
|
|
|
|
# Creates a comments summary for every entry of the proposals summaries file,
# skipping entries whose commentable_id already has a "Proposal" summary.
# The :id attribute coming from the file is discarded.
def import_ml_proposals_comments_summary
  path = data_folder.join(MachineLearning.proposals_comments_summary_filename)
  summaries = JSON.parse(File.read(path)).map(&:deep_symbolize_keys)

  summaries.each do |summary|
    summary.delete(:id)

    existing = MlSummaryComment.find_by(commentable_id: summary[:commentable_id],
                                        commentable_type: "Proposal")
    MlSummaryComment.create!(summary) unless existing
  end
end
|
|
|
|
# Creates a comments summary for every entry of the investments summaries
# file, skipping entries whose commentable_id already has a
# "Budget::Investment" summary. The :id attribute coming from the file is
# discarded.
def import_ml_investments_comments_summary
  path = data_folder.join(MachineLearning.investments_comments_summary_filename)
  summaries = JSON.parse(File.read(path)).map(&:deep_symbolize_keys)

  summaries.each do |summary|
    summary.delete(:id)

    existing = MlSummaryComment.find_by(commentable_id: summary[:commentable_id],
                                        commentable_type: "Budget::Investment")
    MlSummaryComment.create!(summary) unless existing
  end
end
|
|
|
|
# Imports the proposal-to-proposal relations listed in the proposals related
# content file.
#
# Each entry holds the parent :id plus its related ids. The first related id
# gets a score equal to the number of related columns and every following
# column scores one less; blank ids are skipped but still use up their
# position. Existing relations only get their score updated; new ones are
# created as machine learning content authored by the job's user.
def import_proposals_related_content
  path = data_folder.join(MachineLearning.proposals_related_filename)
  rows = JSON.parse(File.read(path)).map(&:deep_symbolize_keys)

  rows.each do |row|
    parent_id = row.delete(:id)
    top_score = row.size

    row.each_value.with_index do |related_id, position|
      next if related_id.blank?

      score = top_score - position
      relation = {
        parent_relationable_id: parent_id,
        parent_relationable_type: "Proposal",
        child_relationable_id: related_id,
        child_relationable_type: "Proposal"
      }

      existing = RelatedContent.find_by(relation)
      if existing
        existing.update!(machine_learning_score: score)
      else
        RelatedContent.create!(relation.merge(machine_learning: true,
                                              author: user,
                                              machine_learning_score: score))
      end
    end
  end
end
|
|
|
|
# Imports the investment-to-investment relations listed in the investments
# related content file.
#
# Each entry holds the parent :id plus its related ids. The first related id
# gets a score equal to the number of related columns and every following
# column scores one less; blank ids are skipped but still use up their
# position. Existing relations only get their score updated; new ones are
# created as machine learning content authored by the job's user.
def import_budget_investments_related_content
  path = data_folder.join(MachineLearning.investments_related_filename)
  rows = JSON.parse(File.read(path)).map(&:deep_symbolize_keys)

  rows.each do |row|
    parent_id = row.delete(:id)
    top_score = row.size

    row.each_value.with_index do |related_id, position|
      next if related_id.blank?

      score = top_score - position
      relation = {
        parent_relationable_id: parent_id,
        parent_relationable_type: "Budget::Investment",
        child_relationable_id: related_id,
        child_relationable_type: "Budget::Investment"
      }

      existing = RelatedContent.find_by(relation)
      if existing
        existing.update!(machine_learning_score: score)
      else
        RelatedContent.create!(relation.merge(machine_learning: true,
                                              author: user,
                                              machine_learning_score: score))
      end
    end
  end
end
|
|
|
|
# Imports the tags and taggings generated for proposals.
#
# First pass: finds or creates a Tag for every named entry of the tags file
# (names are truncated to 150 characters) and remembers which database id
# each file id maps to. Second pass: creates an "ml_tags" tagging on
# "Proposal" for every entry of the taggings file whose tag was mapped in the
# first pass and which has a taggable_id.
def import_ml_proposals_tags
  tag_ids = {}

  tags_path = data_folder.join(MachineLearning.proposals_tags_filename)
  JSON.parse(File.read(tags_path)).map(&:deep_symbolize_keys).each do |tag_attributes|
    name = tag_attributes[:name]
    next if name.blank?

    name = name.truncate(150) if name.length >= 150
    tag_ids[tag_attributes[:id]] = Tag.find_or_create_by!(name: name).id
  end

  taggings_path = data_folder.join(MachineLearning.proposals_taggings_filename)
  JSON.parse(File.read(taggings_path)).map(&:deep_symbolize_keys).each do |tagging_attributes|
    next if tagging_attributes[:tag_id].blank?

    tag_id = tag_ids[tagging_attributes[:tag_id]]
    next unless Tag.find_by(id: tag_id) && tagging_attributes[:taggable_id].present?

    Tagging.create!(tagging_attributes.merge(tag_id: tag_id,
                                             taggable_type: "Proposal",
                                             context: "ml_tags"))
  end
end
|
|
|
|
# Imports the tags and taggings generated for budget investments.
#
# First pass: finds or creates a Tag for every named entry of the tags file
# (names are truncated to 150 characters) and remembers which database id
# each file id maps to. Second pass: creates an "ml_tags" tagging on
# "Budget::Investment" for every entry of the taggings file whose tag was
# mapped in the first pass and which has a taggable_id.
def import_ml_investments_tags
  tag_ids = {}

  tags_path = data_folder.join(MachineLearning.investments_tags_filename)
  JSON.parse(File.read(tags_path)).map(&:deep_symbolize_keys).each do |tag_attributes|
    name = tag_attributes[:name]
    next if name.blank?

    name = name.truncate(150) if name.length >= 150
    tag_ids[tag_attributes[:id]] = Tag.find_or_create_by!(name: name).id
  end

  taggings_path = data_folder.join(MachineLearning.investments_taggings_filename)
  JSON.parse(File.read(taggings_path)).map(&:deep_symbolize_keys).each do |tagging_attributes|
    next if tagging_attributes[:tag_id].blank?

    tag_id = tag_ids[tagging_attributes[:tag_id]]
    next unless Tag.find_by(id: tag_id) && tagging_attributes[:taggable_id].present?

    Tagging.create!(tagging_attributes.merge(tag_id: tag_id,
                                             taggable_type: "Budget::Investment",
                                             context: "ml_tags"))
  end
end
|
|
|
|
# Records, on the MachineLearningInfo of the given kind (created when
# missing), when the job started and which script it ran.
def update_machine_learning_info_for(kind)
  info = MachineLearningInfo.find_or_create_by!(kind: kind)
  info.update!(generated_at: job.started_at, script: job.script)
end
|
|
|
|
# Snapshots the current modification time of every output file.
#
# Returns a hash mapping each output file name to its modification time, or
# to nil when the file does not exist.
def set_previous_modified_date
  output_filenames = [
    MachineLearning.proposals_tags_filename,
    MachineLearning.proposals_taggings_filename,
    MachineLearning.investments_tags_filename,
    MachineLearning.investments_taggings_filename,
    MachineLearning.proposals_related_filename,
    MachineLearning.investments_related_filename,
    MachineLearning.proposals_comments_summary_filename,
    MachineLearning.investments_comments_summary_filename
  ]

  output_filenames.each_with_object({}) do |filename, dates|
    dates[filename] = last_modified_date_for(filename)
  end
end
|
|
|
|
# Modification time of the given file inside the data folder, or nil when
# the file does not exist.
def last_modified_date_for(filename)
  path = data_folder.join(filename)

  File.mtime(path) if File.exist?(path)
end
|
|
|
|
# Whether the given output file changed since this instance was built.
#
# A missing file never counts as updated; an existing file with no previous
# modification time always does; otherwise the current modification time
# must be later than the recorded one.
def updated_file?(filename)
  return false unless File.exist?(data_folder.join(filename))

  previous_date = previous_modified_date[filename]
  return true if previous_date.blank?

  last_modified_date_for(filename) > previous_date
end
|
|
|
|
# Stores a failure on the job and enqueues the error email.
#
# The stored text is the error message followed by the backtrace lines that
# mention "machine_learning.rb", joined with "<br>".
def handle_error(error)
  relevant_frames = error.backtrace.grep(/machine_learning\.rb/)
  full_error = [error.message, *relevant_frames].join("<br>")

  job.update!(finished_at: Time.current, error: full_error)
  Mailer.machine_learning_error(user).deliver_later
end
|
|
end
|