Create MicrosoftTranslateClient

- Connect to the remote translation service and translate an array of strings
- Create SentencesParser module with texts management methods:
  - Detect split position method: when the text to be translated is too large, we need to split it into smaller parts so that it can be translated. This method searches for the first valid point (dot or whitespace) at which to split the text, so that we get a response without cutting a word in half.
This commit is contained in:
taitus
2019-01-25 14:04:38 +01:00
committed by voodoorai2000
parent fa80d96249
commit 31011033a7
3 changed files with 273 additions and 0 deletions

View File

@@ -0,0 +1,81 @@
require "translator-text"
include SentencesParser
# Translates arrays of texts through TranslatorText::Client.
#
# When the combined size of the texts fits in CHARACTERS_LIMIT_PER_REQUEST
# they are sent in a single request. Otherwise every text is translated on
# its own, split into fragments when it is itself over the limit, and the
# translated fragments are joined back together.
#
# nil values are sent as PREVENTING_TRANSLATION_KEY and mapped back to nil in
# the result, see
# https://docs.microsoft.com/es-es/azure/cognitive-services/translator/prevent-translation
#
# NOTE(review): parse_locale, detect_split_position and characters_count are
# not defined in this class; they are expected to be in scope (the last two
# are defined in SentencesParser) -- confirm where parse_locale comes from.
class MicrosoftTranslateClient
  # Maximum number of characters sent in one translation request.
  CHARACTERS_LIMIT_PER_REQUEST = 5000
  # Placeholder sent in place of nil values.
  PREVENTING_TRANSLATION_KEY = "notranslate"

  # Builds the underlying client with the API key stored in the Rails secrets.
  def initialize
    api_key = Rails.application.secrets.microsoft_api_key
    @client = TranslatorText::Client.new(api_key)
  end

  # Translates +fields_values+ (an array of strings or nils) into +locale+.
  #
  # Returns an array with one entry per input value, in the same order:
  # the translated text, or nil where the input value was nil.
  def call(fields_values, locale)
    texts = prepare_texts(fields_values)
    valid_locale = parse_locale(locale)
    request_translation(texts, valid_locale)
  end

  # Splits +text+ into an array of fragments, each no longer than
  # CHARACTERS_LIMIT_PER_REQUEST. Joining the fragments gives +text+ back.
  # A text that already fits is returned as a single-element array.
  def fragments_for(text)
    return [text] if text.size <= CHARACTERS_LIMIT_PER_REQUEST

    split_position = detect_split_position(text)
    # A split on the last character would leave an empty tail and recurse on
    # the unchanged text forever, so fall back to a hard split that leaves a
    # tail just under the limit.
    if split_position >= text.size - 1
      split_position = text.size - CHARACTERS_LIMIT_PER_REQUEST
    end

    start_text = text[0..split_position]
    end_text = text[(split_position + 1)..-1]
    fragments_for(start_text) + [end_text]
  end

  private

  # Sends everything in one request when it fits in the limit; otherwise
  # translates each text separately and rebuilds it from its fragments.
  def request_translation(texts, locale)
    if characters_count(texts) <= CHARACTERS_LIMIT_PER_REQUEST
      @client.translate(texts, to: locale).map { |object| get_field_value(object) }
    else
      texts.map { |text| build_translation(translate_text(text, locale)) }
    end
  end

  # Translates one text fragment by fragment and returns the flat list of
  # response objects, in fragment order.
  def translate_text(text, locale)
    fragments_for(text).flat_map do |fragment|
      @client.translate([fragment], to: locale)
    end
  end

  # Joins the translated fragments of one text. Returns nil when no fragment
  # produced a value, so a nil input yields nil here exactly as it does in
  # the single-request path (joining would turn it into "").
  def build_translation(objects)
    values = objects.map { |object| get_field_value(object) }.compact
    values.join unless values.empty?
  end

  # Reads the first translation text of a response object; the placeholder
  # for nil values is turned back into nil.
  def get_field_value(object)
    text = object.translations[0].text
    notranslate?(text) ? nil : text
  end

  # Replaces nil values with the placeholder before sending them.
  def prepare_texts(texts)
    texts.map { |text| text || PREVENTING_TRANSLATION_KEY }
  end

  # True when +text+ is the placeholder, ignoring case.
  def notranslate?(text)
    text.downcase == PREVENTING_TRANSLATION_KEY
  end
end

24
lib/sentences_parser.rb Normal file
View File

@@ -0,0 +1,24 @@
# Text helpers used to keep translation requests under
# MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST.
module SentencesParser
  # Returns the index of the character after which +text+ should be split.
  #
  # The search window starts at (text size - limit), clamped to 0, so the
  # part after the split is always shorter than the limit. Inside the window
  # the first "." is preferred, then the first " "; when neither is found the
  # window start itself is returned (which may cut a word).
  def detect_split_position(text)
    limit = MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST
    minimum_valid_index = [text.size - limit, 0].max
    # The final character is excluded from the search: splitting on it would
    # leave nothing after the split, and a caller that keeps splitting the
    # first part would never make progress.
    window = text[minimum_valid_index...-1].to_s
    valid_point = window.index(".")
    valid_whitespace = window.index(" ")
    get_split_position(valid_point, valid_whitespace, minimum_valid_index)
  end

  # Turns offsets relative to the search window into an index of the text.
  #
  # valid_point and valid_whitespace are offsets (or nil when not found);
  # the dot wins when both are given. Returns minimum_valid_index unchanged
  # when both are nil.
  def get_split_position(valid_point, valid_whitespace, minimum_valid_index)
    valid_position = valid_point || valid_whitespace
    return minimum_valid_index if valid_position.nil?

    minimum_valid_index + valid_position
  end

  # Total number of characters in +texts+; 0 for an empty array.
  def characters_count(texts)
    texts.sum(&:size)
  end
end

View File

@@ -0,0 +1,168 @@
require "rails_helper"
# Specs for MicrosoftTranslateClient.
# In the "#call" examples TranslatorText::Client#translate is stubbed with
# expect_any_instance_of, so they run against canned responses built with
# create_response. Most examples lower CHARACTERS_LIMIT_PER_REQUEST with
# stub_const so that short texts are enough to trigger splitting.
describe MicrosoftTranslateClient do
let(:microsoft_client) { described_class.new }
describe "#call" do
# Everything fits in the limit: translate is stubbed once and its response
# carries one entry per text.
context "when characters from request are less than the characters limit" do
it "response has the expected result" do
response = create_response("Nuevo título", "Nueva descripción")
expect_any_instance_of(TranslatorText::Client).to receive(:translate).and_return(response)
result = microsoft_client.call([ "New title", "New description"], :es)
expect(result).to eq(["Nuevo título", "Nueva descripción"])
end
# The stubbed response answers "Notranslate" for the nil value; the client
# is expected to turn that entry back into nil.
it "response nil has the expected result when request has nil value" do
response = create_response("Notranslate", "Nueva descripción")
expect_any_instance_of(TranslatorText::Client).to receive(:translate).and_return(response)
result = microsoft_client.call([nil, "New description"], :es)
expect(result).to eq([nil, "Nueva descripción"])
end
end
# The combined size of the texts is over the (stubbed) limit of 20.
context "when characters from request are greater than characters limit" do
# Two texts requested with 11 characters each: over the limit together,
# under it individually. Each translate expectation returns a
# single-entry response.
it "response has the expected result when the request has 2 texts, where both less than CHARACTERS_LIMIT_PER_REQUEST" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
text_en = Faker::Lorem.characters(11)
another_text_en = Faker::Lorem.characters(11)
translated_text_es = Faker::Lorem.characters(11)
another_translated_text_es = Faker::Lorem.characters(11)
response_text = create_response(translated_text_es)
response_another_text = create_response(another_translated_text_es)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).exactly(1)
.times
.and_return(response_text)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).exactly(1)
.times
.and_return(response_another_text)
result = microsoft_client.call([text_en, another_text_en], :es)
expect(result).to eq([translated_text_es, another_translated_text_es])
end
# Each text is over the limit on its own. The first is built around a
# space and the second around a dot; translate is expected once per
# fragment (text up to and including the separator, then the rest), and
# each result is the concatenation of its translated fragments.
it "response has the expected result when the request has 2 texts and both are greater than CHARACTERS_LIMIT_PER_REQUEST" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
start_text_en = Faker::Lorem.characters(10) + " "
end_text_en = Faker::Lorem.characters(10)
text_en = start_text_en + end_text_en
start_translated_text_es = Faker::Lorem.characters(10) + " "
end_translated_text_es = Faker::Lorem.characters(10)
translated_text_es = start_translated_text_es + end_translated_text_es
response_start_text = create_response(start_translated_text_es)
response_end_text = create_response(end_translated_text_es)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([start_text_en], to: :es)
.exactly(1)
.times
.and_return(response_start_text)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([end_text_en], to: :es)
.exactly(1)
.times
.and_return(response_end_text)
start_another_text_en = Faker::Lorem.characters(12) + "."
end_another_text_en = Faker::Lorem.characters(12)
another_text_en = start_another_text_en + end_another_text_en
another_start_translated_text_es = Faker::Lorem.characters(12) + "."
another_end_translated_text_es = Faker::Lorem.characters(12)
another_translated_text_es = another_start_translated_text_es + another_end_translated_text_es
response_another_start_text = create_response(another_start_translated_text_es)
response_another_end_text = create_response(another_end_translated_text_es)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([start_another_text_en], to: :es)
.exactly(1)
.times
.and_return(response_another_start_text)
expect_any_instance_of(TranslatorText::Client).to receive(:translate).with([end_another_text_en], to: :es)
.exactly(1)
.times
.and_return(response_another_end_text)
result = microsoft_client.call([text_en, another_text_en], :es)
expect(result).to eq([translated_text_es, another_translated_text_es])
end
end
end
# NOTE(review): despite the group name, these examples call the public
# fragments_for method; detect_split_position is only covered indirectly.
describe "#detect_split_position" do
context "text has less characters than characters limit" do
it "does not split the text" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
text_to_translate = Faker::Lorem.characters(10)
result = microsoft_client.fragments_for(text_to_translate)
expect(result).to eq [text_to_translate]
end
end
context "text has more characters than characters limit" do
# The dot stays at the end of the first fragment.
it "to split text by first valid dot when there is a dot for split" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
start_text = Faker::Lorem.characters(10) + "."
end_text = Faker::Lorem.characters(10)
text_to_translate = start_text + end_text
result = microsoft_client.fragments_for(text_to_translate)
expect(result).to eq([start_text, end_text])
end
# The space stays at the end of the first fragment.
it "to split text by first valid space when there is not a dot for split but there is a space" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 20)
start_text = Faker::Lorem.characters(10) + " "
end_text = Faker::Lorem.characters(10)
text_to_translate = start_text + end_text
result = microsoft_client.fragments_for(text_to_translate)
expect(result).to eq([start_text, end_text])
end
# The only space and dot are placed right after the first 5 characters,
# and the expected fragments show they are not used as the split point:
# the first fragment ends after sub_part_text_2, between two runs of
# random characters.
it "to split text in the middle of a word when there are not valid dots and spaces" do
stub_const("MicrosoftTranslateClient::CHARACTERS_LIMIT_PER_REQUEST", 40)
sub_part_text_1 = Faker::Lorem.characters(5) + " ."
sub_part_text_2 = Faker::Lorem.characters(5)
sub_part_text_3 = Faker::Lorem.characters(9)
sub_part_text_4 = Faker::Lorem.characters(30)
text_to_translate = sub_part_text_1 + sub_part_text_2 + sub_part_text_3 + sub_part_text_4
result = microsoft_client.fragments_for(text_to_translate)
expect(result).to eq([sub_part_text_1 + sub_part_text_2, sub_part_text_3 + sub_part_text_4])
end
end
end
end
# Builds a fake translation response for the specs.
#
# Returns one object per given text, in order. Each object answers
# +translations+ with a single-element array whose element answers +text+
# with the given text, mirroring the part of the real response the client
# reads (e.g. result.translations[0].text => "Nuevo título").
def create_response(*args)
  result_class = Struct.new(:translations)
  translation_class = Struct.new(:text)

  args.map do |translated_text|
    result_class.new([translation_class.new(translated_text)])
  end
end