Files
phoenix-ausblick/lib/outlook/internal_tree/raw_internal_basic.ex
2023-01-13 14:09:37 +01:00

162 lines
4.7 KiB
Elixir

defmodule Outlook.InternalTree.RawInternalBasic do
@moduledoc """
Functions for the raw internal tree, the transitory state after importing
HTML and before text nodes are split into `%TranslationUnit{}`s.
"""
alias Outlook.InternalTree.InternalNode
alias Outlook.InternalTree.TranslationUnit
alias Outlook.InternalTree.Html
@splitmarker "@@translationunit@@"
@nonperiodmarker "@@nonperiod@@"
@doc """
Inserts the split marker after each sentence end in every text node of the tree.

Element nodes are descended into recursively; nodes that are neither `:text`
nor `:element` `%InternalNode{}`s are passed through unchanged.
"""
def set_split_markers([]), do: []

def set_split_markers([%InternalNode{type: :text, content: text} = textnode | rest]) do
  marked_text =
    text
    # Shield periods that directly follow an uppercase letter or a digit
    # (presumably initials, abbreviations and numbers) from the next step.
    |> String.replace(~r/([[:upper:]\d])\.(\d)?/u, "\\1#{@nonperiodmarker}\\2")
    # Append the split marker after ".", "?" or "!", an optional closing
    # quote and any whitespace that follows.
    |> String.replace(~r|([.?!]["'”]?\s*)|u, "\\1#{@splitmarker}")
    # Turn the shielded periods back into ordinary ones.
    |> String.replace(@nonperiodmarker, ".")

  [%InternalNode{textnode | content: marked_text} | set_split_markers(rest)]
end

def set_split_markers([%InternalNode{type: :element, content: children} = element | rest]) do
  marked_children = set_split_markers(children)
  [%InternalNode{element | content: marked_children} | set_split_markers(rest)]
end

def set_split_markers([other | rest]), do: [other | set_split_markers(rest)]
@doc """
Partitions the inline text of `raw_tree` into translation units.

Entry point that simply delegates to `partition_blocklevel/1`.
"""
def partition_to_tunits(raw_tree), do: partition_blocklevel(raw_tree)
@doc """
Walks the block-level part of the tree and converts inline content into
translation units.

For every element node the result of `get_sibling_collocation/1` on its
content decides what happens to that content:

  * `:block` - the content is processed recursively,
  * `:inline` - the content is replaced by a list of translation units,
  * `nil` - the content is kept as it is.

Any other result raises a `CaseClauseError`. Non-element nodes are passed
through unchanged.
"""
def partition_blocklevel([]), do: []

def partition_blocklevel([%InternalNode{type: :element, content: children} = element | rest]) do
  partitioned =
    case get_sibling_collocation(children) do
      :block -> partition_blocklevel(children)
      :inline -> inline_to_translation_units(children)
      nil -> children
    end

  [%InternalNode{element | content: partitioned} | partition_blocklevel(rest)]
end

def partition_blocklevel([other | rest]), do: [other | partition_blocklevel(rest)]
# Converts the inline content of an element into a flat list of
# `%TranslationUnit{}`s: the content is split at the split markers, regrouped
# into one node list per chunk, cleaned of empty nodes and attributes, and
# each chunk is rendered with `Html.to_html/1`. Units whose content is the
# empty string are dropped at the end.
defp inline_to_translation_units(inline_tree) do
  inline_tree
  |> partition_inlinelevel()
  |> chunk_with_list()
  |> Enum.map(&strip_empty_nodes/1)
  |> Enum.map(&Html.strip_attributes/1)
  |> Enum.map(&sentence_to_tunit/1)
  |> strip_empty_tunits()
end

# Wraps one chunk of nodes in a fresh, untranslated unit with a new nid.
defp sentence_to_tunit(sentence) do
  %TranslationUnit{
    content: Html.to_html(sentence),
    status: :untranslated,
    nid: Nanoid.generate()
  }
end
@doc """
Splits inline nodes at the split markers set by `set_split_markers/1`.

The returned list is deliberately nested so that `chunk_with_list/1` can
regroup it afterwards:

  * a text node containing the split marker becomes a list of text nodes, one
    per piece (empty pieces included); a text node without marker stays as is,
  * an element node becomes a list of copies of itself, one per chunk of its
    own partitioned content,
  * any other node is passed through unchanged.
"""
def partition_inlinelevel([]), do: []

def partition_inlinelevel([%InternalNode{type: :element, content: children} = element | rest]) do
  fragments =
    children
    |> partition_inlinelevel()
    |> chunk_with_list()
    |> Enum.map(fn chunk -> %InternalNode{element | content: chunk} end)

  [fragments | partition_inlinelevel(rest)]
end

def partition_inlinelevel([%InternalNode{type: :text, content: text} = textnode | rest]) do
  pieces =
    case String.contains?(text, @splitmarker) do
      true ->
        for piece <- String.split(text, @splitmarker, trim: false) do
          %InternalNode{textnode | content: piece}
        end

      false ->
        textnode
    end

  [pieces | partition_inlinelevel(rest)]
end

def partition_inlinelevel([other | rest]), do: [other | partition_inlinelevel(rest)]
@doc """
Removes empty nodes from a list of nodes.

Text nodes whose content is `""` are dropped. Element nodes are cleaned
recursively and dropped as well when no content is left afterwards. All other
nodes are kept unchanged.
"""
def strip_empty_nodes([]), do: []

def strip_empty_nodes([%{type: :text, content: ""} | rest]), do: strip_empty_nodes(rest)

def strip_empty_nodes([%{type: :element} = element | rest]) do
  case strip_empty_nodes(element.content) do
    [] -> strip_empty_nodes(rest)
    kept -> [%InternalNode{element | content: kept} | strip_empty_nodes(rest)]
  end
end

def strip_empty_nodes([other | rest]), do: [other | strip_empty_nodes(rest)]
@doc """
Removes every `%TranslationUnit{}` whose content is `""`.

Element nodes are descended into recursively; unlike `strip_empty_nodes/1`,
an element is kept even when its content ends up empty. All other entries are
passed through unchanged.
"""
def strip_empty_tunits([]), do: []

def strip_empty_tunits([%TranslationUnit{content: ""} | rest]), do: strip_empty_tunits(rest)

def strip_empty_tunits([%{type: :element} = element | rest]) do
  cleaned = strip_empty_tunits(element.content)
  [%InternalNode{element | content: cleaned} | strip_empty_tunits(rest)]
end

def strip_empty_tunits([other | rest]), do: [other | strip_empty_tunits(rest)]
@doc """
Regroups `list` into chunks, treating every nested list as chunk boundaries.

Plain elements are appended to the current chunk. The elements of a nested
list are spread over consecutive chunks: its first element ends the current
chunk, each element in between forms a chunk of its own and its last element
starts the next chunk.

`nil` is used internally as the boundary marker, so a `nil` element in the
input also acts as a boundary. Every other value, including `false`, is kept
as a regular element. Lists nested more deeply are flattened into their
parent without adding boundaries. An empty final chunk is not emitted.

## Examples

    iex> chunk_with_list([1, 1, [2, 2], 3, 3, [4, 4, 4], 5, 5])
    [[1, 1, 2], [2, 3, 3, 4], [4], [4, 5, 5]]

    iex> chunk_with_list([1, 1, [1, 2], 2, 2, [2, 3, 4], 4, 4])
    [[1, 1, 1], [2, 2, 2, 2], [3], [4, 4, 4]]

"""
def chunk_with_list(list) do
  # Match the `nil` marker explicitly: a truthiness check would also treat a
  # literal `false` element as a boundary and silently drop it.
  chunk_fun = fn
    nil, acc -> {:cont, Enum.reverse(acc), []}
    el, acc -> {:cont, [el | acc]}
  end

  # Emit whatever is left over, but never an empty trailing chunk.
  after_fun = fn
    [] -> {:cont, []}
    acc -> {:cont, Enum.reverse(acc), []}
  end

  list
  |> Enum.map(fn
    # Put a boundary marker between the members of each nested list.
    el when is_list(el) -> Enum.intersperse(el, nil)
    el -> el
  end)
  |> List.flatten()
  |> Enum.chunk_while([], chunk_fun, after_fun)
end
@doc """
Returns the first `sibling_with` value found in `content` that is not `:both`.

Each node's value is read from `node.eph.sibling_with`. The result is expected
to be `:block`, `:inline` or `nil` (`nil` when `content` is empty or holds only
`:both` nodes). Assumes that `content` does not mix `:block` and `:inline`.
"""
def get_sibling_collocation(content) do
  collocations = for node <- content, uniq: true, do: node.eph.sibling_with

  case List.delete(collocations, :both) do
    [] -> nil
    [first | _] -> first
  end
end
end