From 31011033a7f5c36e3a5eba69e561149581a9e678 Mon Sep 17 00:00:00 2001
From: taitus
Date: Fri, 25 Jan 2019 14:04:38 +0100
Subject: [PATCH] Create MicrosoftTranslateClient

- Connect to the remote translation service and translate an array of
  strings
- Create SentencesParser module with text management methods:
  - Detect split position method: when the text to translate is too
    large, we need to split it into smaller parts so that it can be
    translated. This method searches for the first valid point (dot or
    whitespace) at which to split the text, so that we get a response
    without cutting a word in half.
---
 lib/microsoft_translate_client.rb           |  97 +++++++++++
 lib/sentences_parser.rb                     |  31 ++++
 spec/lib/microsoft_translate_client_spec.rb | 202 ++++++++++++++++++++++
 3 files changed, 330 insertions(+)
 create mode 100644 lib/microsoft_translate_client.rb
 create mode 100644 lib/sentences_parser.rb
 create mode 100644 spec/lib/microsoft_translate_client_spec.rb

diff --git a/lib/microsoft_translate_client.rb b/lib/microsoft_translate_client.rb
new file mode 100644
index 000000000..23b595403
--- /dev/null
+++ b/lib/microsoft_translate_client.rb
@@ -0,0 +1,97 @@
+require "translator-text"
+
+# Translates arrays of texts through TranslatorText::Client, splitting any
+# text that is too long to be sent in a single request.
+class MicrosoftTranslateClient
+  # Mixed in here rather than at the top level of the file: a top-level
+  # `include` adds the module's methods to Object, i.e. to every object.
+  include SentencesParser
+
+  CHARACTERS_LIMIT_PER_REQUEST = 5000
+  PREVENTING_TRANSLATION_KEY = "notranslate"
+
+  def initialize
+    api_key = Rails.application.secrets.microsoft_api_key
+    @client = TranslatorText::Client.new(api_key)
+  end
+
+  # Translates +fields_values+, an array of strings in which nil marks a
+  # field with nothing to translate, into +locale+. Returns the translations
+  # in the same order, with nil wherever the input was nil.
+  def call(fields_values, locale)
+    texts = prepare_texts(fields_values)
+    # NOTE(review): parse_locale is not defined anywhere in this patch;
+    # confirm where it comes from before merging.
+    valid_locale = parse_locale(locale)
+    request_translation(texts, valid_locale)
+  end
+
+  # Cuts +text+ into consecutive fragments of at most
+  # CHARACTERS_LIMIT_PER_REQUEST characters, cutting right after a dot or a
+  # space when a suitable one is found. Joining the fragments gives +text+.
+  def fragments_for(text)
+    return [text] if text.size <= CHARACTERS_LIMIT_PER_REQUEST
+
+    split_position = detect_split_position(text)
+    start_text = text[0..split_position]
+    end_text = text[(split_position + 1)..-1]
+
+    fragments_for(start_text) + [end_text]
+  end
+
+  private
+
+  # Sends every text in a single request when together they fit within the
+  # limit; otherwise translates them one by one, fragmenting the long ones.
+  def request_translation(texts, locale)
+    split_response = characters_count(texts) > CHARACTERS_LIMIT_PER_REQUEST
+
+    response = if split_response
+                 texts.map { |text| translate_text(text, locale) }
+               else
+                 @client.translate(texts, to: locale)
+               end
+
+    parse_response(response, split_response)
+  end
+
+  # Translates one text fragment by fragment and returns the response
+  # objects of all its fragments in a flat array.
+  def translate_text(text, locale)
+    fragments_for(text).map do |fragment|
+      @client.translate([fragment], to: locale)
+    end.flatten
+  end
+
+  # Turns the raw response into one translated string (or nil) per text.
+  # When +split_response+ is true each element of +response+ is the array of
+  # response objects for the fragments of a single text.
+  def parse_response(response, split_response)
+    response.map do |object|
+      split_response ? build_translation(object) : get_field_value(object)
+    end
+  end
+
+  # Joins the translated fragments of one text. Returns nil when no fragment
+  # produced a value, so a nil field stays nil even if the request was split.
+  def build_translation(objects)
+    values = objects.map { |object| get_field_value(object) }
+    values.join unless values.all?(&:nil?)
+  end
+
+  def get_field_value(object)
+    text = object.translations[0].text
+    notranslate?(text) ? nil : text
+  end
+
+  # nil fields are replaced by PREVENTING_TRANSLATION_KEY before being sent
+  # and turned back into nil by get_field_value; see
+  # https://docs.microsoft.com/es-es/azure/cognitive-services/translator/prevent-translation
+  def prepare_texts(texts)
+    texts.map { |text| text || PREVENTING_TRANSLATION_KEY }
+  end
+
+  def notranslate?(text)
+    text.downcase == PREVENTING_TRANSLATION_KEY
+  end
+end
diff --git a/lib/sentences_parser.rb b/lib/sentences_parser.rb
new file mode 100644
index 000000000..890a6eb57
--- /dev/null
+++ b/lib/sentences_parser.rb
@@ -0,0 +1,31 @@
+# Helpers for measuring texts and for choosing where to cut the ones that
+# are too long to be translated in a single request.
+module SentencesParser
+
+  # Returns the index of the last character of the first part when +text+
+  # is cut in two. +text+ is expected to be longer than
+  # MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST.
+  def detect_split_position(text)
+    minimum_valid_index = text.size - MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST
+    # The last character is left out of the search: cutting there leaves an
+    # empty second part and a first part identical to +text+, which made
+    # MicrosoftTranslateClient#fragments_for recurse without end.
+    candidates = text[minimum_valid_index...-1]
+    valid_point = candidates.index(".")
+    valid_whitespace = candidates.index(" ")
+
+    get_split_position(valid_point, valid_whitespace, minimum_valid_index)
+  end
+
+  # Offsets are relative to +minimum_valid_index+. A dot is preferred over a
+  # space; with neither, the cut falls on +minimum_valid_index+ itself.
+  def get_split_position(valid_point, valid_whitespace, minimum_valid_index)
+    minimum_valid_index + (valid_point || valid_whitespace || 0)
+  end
+
+  # Total number of characters in +texts+; 0 for an empty array.
+  def characters_count(texts)
+    texts.map(&:size).reduce(0, :+)
+  end
+
+end
diff --git a/spec/lib/microsoft_translate_client_spec.rb b/spec/lib/microsoft_translate_client_spec.rb
new file mode 100644
index 000000000..e93203502
--- /dev/null
+++ b/spec/lib/microsoft_translate_client_spec.rb
@@ -0,0 +1,202 @@
+require "rails_helper"
+
+describe MicrosoftTranslateClient do
+
+  let(:microsoft_client) { described_class.new }
+
+  describe "#call" do
+
+    context "when characters from request are less than the characters limit" do
+      it "response has the expected result" do
+        response = create_response("Nuevo título", "Nueva descripción")
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).and_return(response)
+
+        result = microsoft_client.call(["New title", "New description"], :es)
+
+        expect(result).to eq(["Nuevo título", "Nueva descripción"])
+      end
+
+      it "response nil has the expected result when request has nil value" do
+        response = create_response("Notranslate", "Nueva descripción")
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).and_return(response)
+
+        result = microsoft_client.call([nil, "New description"], :es)
+
+        expect(result).to eq([nil, "Nueva descripción"])
+      end
+
+    end
+
+    context "when characters from request are greater than characters limit" do
+      it "response has the expected result when the request has 2 texts, where both less than CHARACTERS_LIMIT_PER_REQUEST" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        text_en = Faker::Lorem.characters(11)
+        another_text_en = Faker::Lorem.characters(11)
+
+        translated_text_es = Faker::Lorem.characters(11)
+        another_translated_text_es = Faker::Lorem.characters(11)
+        response_text = create_response(translated_text_es)
+        response_another_text = create_response(another_translated_text_es)
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).exactly(1)
+                  .times
+                  .and_return(response_text)
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).exactly(1)
+                  .times
+                  .and_return(response_another_text)
+
+        result = microsoft_client.call([text_en, another_text_en], :es)
+
+        expect(result).to eq([translated_text_es, another_translated_text_es])
+      end
+
+      it "response has the expected result when the request has 2 texts and both are greater than CHARACTERS_LIMIT_PER_REQUEST" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        start_text_en = Faker::Lorem.characters(10) + " "
+        end_text_en = Faker::Lorem.characters(10)
+        text_en = start_text_en + end_text_en
+
+        start_translated_text_es = Faker::Lorem.characters(10) + " "
+        end_translated_text_es = Faker::Lorem.characters(10)
+        translated_text_es = start_translated_text_es + end_translated_text_es
+        response_start_text = create_response(start_translated_text_es)
+        response_end_text = create_response(end_translated_text_es)
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([start_text_en], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_start_text)
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([end_text_en], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_end_text)
+
+        start_another_text_en = Faker::Lorem.characters(12) + "."
+        end_another_text_en = Faker::Lorem.characters(12)
+        another_text_en = start_another_text_en + end_another_text_en
+
+        another_start_translated_text_es = Faker::Lorem.characters(12) + "."
+        another_end_translated_text_es = Faker::Lorem.characters(12)
+        another_translated_text_es = another_start_translated_text_es + another_end_translated_text_es
+        response_another_start_text = create_response(another_start_translated_text_es)
+        response_another_end_text = create_response(another_end_translated_text_es)
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([start_another_text_en], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_another_start_text)
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([end_another_text_en], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_another_end_text)
+
+        result = microsoft_client.call([text_en, another_text_en], :es)
+
+        expect(result).to eq([translated_text_es, another_translated_text_es])
+      end
+
+      it "response nil has the expected result when request has nil value" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        text_en = Faker::Lorem.characters(11)
+        translated_text_es = Faker::Lorem.characters(11)
+        response_nil = create_response("Notranslate")
+        response_text = create_response(translated_text_es)
+
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with(["notranslate"], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_nil)
+        expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([text_en], to: :es)
+                  .exactly(1)
+                  .times
+                  .and_return(response_text)
+
+        result = microsoft_client.call([nil, text_en], :es)
+
+        expect(result).to eq([nil, translated_text_es])
+      end
+
+    end
+
+  end
+
+  describe "#detect_split_position" do
+
+    context "text has less characters than characters limit" do
+      it "does not split the text" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        text_to_translate = Faker::Lorem.characters(10)
+
+        result = microsoft_client.fragments_for(text_to_translate)
+
+        expect(result).to eq [text_to_translate]
+      end
+    end
+
+    context "text has more characters than characters limit" do
+      it "to split text by first valid dot when there is a dot for split" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        start_text = Faker::Lorem.characters(10) + "."
+        end_text = Faker::Lorem.characters(10)
+        text_to_translate = start_text + end_text
+
+        result = microsoft_client.fragments_for(text_to_translate)
+
+        expect(result).to eq([start_text, end_text])
+      end
+
+      it "to split text by first valid space when there is not a dot for split but there is a space" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        start_text = Faker::Lorem.characters(10) + " "
+        end_text = Faker::Lorem.characters(10)
+        text_to_translate = start_text + end_text
+
+        result = microsoft_client.fragments_for(text_to_translate)
+
+        expect(result).to eq([start_text, end_text])
+      end
+
+      it "to split text in the middle of a word when there are not valid dots and spaces" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 40)
+        sub_part_text_1 = Faker::Lorem.characters(5) + " ."
+        sub_part_text_2 = Faker::Lorem.characters(5)
+        sub_part_text_3 = Faker::Lorem.characters(9)
+        sub_part_text_4 = Faker::Lorem.characters(30)
+        text_to_translate = sub_part_text_1 + sub_part_text_2 + sub_part_text_3 + sub_part_text_4
+
+        result = microsoft_client.fragments_for(text_to_translate)
+
+        expect(result).to eq([sub_part_text_1 + sub_part_text_2, sub_part_text_3 + sub_part_text_4])
+      end
+
+      it "does not split at the last character when the only dot ends the text" do
+        stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
+        start_text = Faker::Lorem.characters(10) + " "
+        end_text = Faker::Lorem.characters(10) + "."
+        text_to_translate = start_text + end_text
+
+        result = microsoft_client.fragments_for(text_to_translate)
+
+        expect(result).to eq([start_text, end_text])
+      end
+
+    end
+  end
+end
+
+def create_response(*args)
+  # Stand-ins for the objects returned by TranslatorText::Client#translate;
+  # MicrosoftTranslateClient only reads `translations[0].text` from them.
+  # response = [#] detectedLanguage={"language"=>"en", "score"=>1.0}>, #] detectedLanguage={"language"=>"en", "score"=>1.0}>]
+  translations = Struct.new(:translations)
+  text = Struct.new(:text)
+  response = []
+
+  args.each do |text_to_response|
+    response << translations.new([text.new(text_to_response)])
+  end
+
+  response
+end