From cf9118c5ac6e8f3b917f5be9650ac133c6046ccc Mon Sep 17 00:00:00 2001 From: Thelonius Kort Date: Wed, 11 Jan 2023 22:15:04 +0100 Subject: [PATCH] Fix issue with end of sentence 'disguised' by markup --- .../internal_tree/raw_internal_basic.ex | 19 ++++- test/outlook/internaltree_test.exs | 72 +++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/lib/outlook/internal_tree/raw_internal_basic.ex b/lib/outlook/internal_tree/raw_internal_basic.ex index 178a64e..051e4b4 100644 --- a/lib/outlook/internal_tree/raw_internal_basic.ex +++ b/lib/outlook/internal_tree/raw_internal_basic.ex @@ -62,6 +62,7 @@ defmodule Outlook.InternalTree.RawInternalBasic do } end ) + |> strip_empty_tunits() end def partition_inlinelevel([ %InternalNode{type: :element} = node | rest ]) do @@ -73,7 +74,7 @@ defmodule Outlook.InternalTree.RawInternalBasic do def partition_inlinelevel([ %InternalNode{type: :text} = textnode | rest ]) do content = if String.contains?(textnode.content, @splitmarker) do - String.split(textnode.content, @splitmarker, trim: true) + String.split(textnode.content, @splitmarker, trim: false) |> Enum.map(fn cont -> %InternalNode{textnode | content: cont} end) else textnode @@ -89,6 +90,22 @@ defmodule Outlook.InternalTree.RawInternalBasic do def partition_inlinelevel([]), do: [] + def strip_empty_tunits([ %TranslationUnit{content: ""} | rest]) do + strip_empty_tunits(rest) + end + + def strip_empty_tunits([ %{type: :element} = node | rest]) do + [ %InternalNode{ node | content: strip_empty_tunits(node.content) } + | strip_empty_tunits(rest) ] + end + + def strip_empty_tunits([ node | rest]) do + [ node | strip_empty_tunits(rest) ] + end + + def strip_empty_tunits([]), do: [] + + @doc """ iex> chunk_with_list([1, 1, [2, 2], 3, 3, [4, 4, 4], 5, 5]) [[1, 1, 2], [2, 3, 3, 4], [4], [4, 5, 5]] diff --git a/test/outlook/internaltree_test.exs b/test/outlook/internaltree_test.exs index e902766..a1ee710 100644 --- a/test/outlook/internaltree_test.exs +++ b/test/outlook/internaltree_test.exs @@ -133,5 +133,77 @@ defmodule Outlook.InternalTreeTest do } ] end + + test "partition when end of sentence is 'disguised' by some markup" do + tree = [ + %Outlook.InternalTree.InternalNode{ + name: "p", + attributes: %{}, + type: :element, + nid: "oaRwUH3A2wMF", + content: [ + %Outlook.InternalTree.InternalNode{ + name: "", + attributes: %{}, + type: :text, + nid: "xep6gWMVWF1D", + content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for ", + eph: %{sibling_with: :inline} + }, + %Outlook.InternalTree.InternalNode{ + name: "a", + attributes: %{ + href: "https://www.politico.eu/article/fit-for-55-eu-5-things-to-know/" + }, + type: :element, + nid: "qxCrs0csHDLI", + content: [ + %Outlook.InternalTree.InternalNode{ + name: "", + attributes: %{}, + type: :text, + nid: "2WwtRNKMc8Sp", + content: "clean transport.”", + eph: %{sibling_with: :inline} + } + ], + eph: %{sibling_with: :inline} + }, + %Outlook.InternalTree.InternalNode{ + name: "", + attributes: %{}, + type: :text, + nid: "3CKpLvIywr8G", + content: " In reality it will destroy the transport industry, steel, cement as well as coal and gas fuel electric generation. ", + eph: %{sibling_with: :inline} + } + ], + eph: %{sibling_with: :block} + } + ] + assert InternalTree.partition_text(tree) |> unify_nids_in_tunits() == [ + %Outlook.InternalTree.InternalNode{ + name: "p", + attributes: %{}, + type: :element, + nid: "oaRwUH3A2wMF", + content: [ + %Outlook.InternalTree.TranslationUnit{ + status: :untranslated, + nid: "xxxxxx", + content: "This Fit for 55 is the first time in the world that a group of countries, the EU, officially imposes an agenda to force an absurd “Zero” CO2 by 2050 and 55% less CO2 by 2030. EU Green Deal czar, Commissioner Frans Timmermans said in May, “We will strengthen the EU Emissions Trading System, update the Energy Taxation Directive, and propose new CO2 standards for cars, new energy efficiency standards for buildings, new targets for renewables, and new ways of supporting clean fuels and infrastructure for clean transport.”", + eph: %{} + }, + %Outlook.InternalTree.TranslationUnit{ + status: :untranslated, + nid: "xxxxxx", + content: " In reality it will destroy the transport industry, steel, cement as well as coal and gas fuel electric generation. ", + eph: %{} + } + ], + eph: %{sibling_with: :block} + } + ] + end end end