From ae70b420a951180f3c1518e3b30de2e77da95b4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sofia=20Moreno=20O=C3=B1ate?= Date: Tue, 26 Mar 2024 15:48:08 +0100 Subject: [PATCH 1/2] Some functions to public --- lib/csv_sniffer.ex | 137 +++++++++++++++++++++++---------------------- mix.exs | 2 +- 2 files changed, 72 insertions(+), 67 deletions(-) diff --git a/lib/csv_sniffer.ex b/lib/csv_sniffer.ex index 4d00328..49a8be5 100644 --- a/lib/csv_sniffer.ex +++ b/lib/csv_sniffer.ex @@ -36,19 +36,81 @@ defmodule CsvSniffer do {:ok, Dialect.t()} | {:error, reason :: any()} def sniff(sample) when is_binary(sample) do sample - |> guess_quote_and_delimiter() + |> guess_quote() |> guess_delimiter(sample) |> format_response() end - # Looks for text enclosed between two identical quotes (the probable quotechar) which are - # preceded and followed by the same character (the probable delimiter). - # - # For example: - # ,'some text', - # The quote with the most wins, same with the delimiter. If there is no quotechar the delimiter - # can't be determined this way. - defp guess_quote_and_delimiter(sample) do + @doc """ + The delimiter /should/ occur the same number of times on each row. However, due to malformed + data, it may not. We don't want an all or nothing approach, so we allow for small variations + in this number. + 1) build a table of the frequency of each character on every line. + 2) build a table of frequencies of this frequency (meta-frequency?), e.g. 'x occurred 5 + times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows' + 3) use the mode of the meta-frequency to determine the /expected/ frequency for that character + 4) find out how often the character actually meets that goal + 5) the character that best meets its goal is the delimiter + For performance reasons, the data is evaluated in chunks, so it can try and evaluate the + smallest portion of the data possible, evaluating additional chunks as necessary. + """ + @spec guess_delimiter(CsvSniffer.Dialect.t(), binary) :: map + def guess_delimiter(%Dialect{delimiter: nil} = dialect, sample) do + # inside this function, we can be sure there's no escape character ... + newline = find_newline_character(sample) + + split_sample = + sample + |> String.split(newline) + |> Stream.reject(&(String.trim(&1) == "")) + + initial_acc = %{frequency_tables: %{}, total: 0} + + delimiter = + split_sample + |> Stream.chunk_every(10) + |> Enum.reduce_while(initial_acc, fn chunk, + %{frequency_tables: frequency_tables, total: total} -> + new_total = total + length(chunk) + updated_frequency_tables = build_frequency_tables(chunk, frequency_tables) + + possible_delimiters = + updated_frequency_tables + |> get_mode_of_the_frequencies() + |> build_a_list_of_possible_delimiters(new_total) + |> Enum.filter(fn {k, _v} -> Enum.member?(@delimiters, k) end) + |> Enum.into(%{}) + + cont_or_halt = if possible_delimiters == %{}, do: :cont, else: :halt + + {cont_or_halt, + %{ + frequency_tables: updated_frequency_tables, + possible_delimiters: possible_delimiters, + total: new_total + }} + end) + |> Map.get(:possible_delimiters) + |> pick_delimiter() + + %Dialect{dialect | delimiter: delimiter} + end + + def guess_delimiter(dialect, _sample) do + IO.inspect("entra por el que") + dialect + end + + @doc """ + Looks for text enclosed between two identical quotes (the probable quotechar) which are + preceded and followed by the same character (the probable delimiter). + For example: + ,'some text', + The quote with the most wins, same with the delimiter. If there is no quotechar the delimiter + can't be determined this way. + """ + @spec guess_quote(binary) :: CsvSniffer.Dialect.t() + def guess_quote(sample) do sample |> run_quote_regex() |> count_matches() @@ -276,63 +338,6 @@ defmodule CsvSniffer do else: dialect end - # The delimiter /should/ occur the same number of times on each row. However, due to malformed - # data, it may not. We don't want an all or nothing approach, so we allow for small variations - # in this number. - # 1) build a table of the frequency of each character on every line. - # 2) build a table of frequencies of this frequency (meta-frequency?), e.g. 'x occurred 5 - # times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows' - # 3) use the mode of the meta-frequency to determine the /expected/ frequency for that - # character - # 4) find out how often the character actually meets that goal - # 5) the character that best meets its goal is the delimiter - # For performance reasons, the data is evaluated in chunks, so it can try and evaluate the - # smallest portion of the data possible, evaluating additional chunks as necessary. - defp guess_delimiter(%Dialect{delimiter: nil} = dialect, sample) do - # inside this function, we can be sure there's no escape character ... - newline = find_newline_character(sample) - - split_sample = - sample - |> String.split(newline) - |> Stream.reject(&(String.trim(&1) == "")) - - initial_acc = %{frequency_tables: %{}, total: 0} - - delimiter = - split_sample - |> Stream.chunk_every(10) - |> Enum.reduce_while(initial_acc, fn chunk, - %{frequency_tables: frequency_tables, total: total} -> - new_total = total + length(chunk) - updated_frequency_tables = build_frequency_tables(chunk, frequency_tables) - - possible_delimiters = - updated_frequency_tables - |> get_mode_of_the_frequencies() - |> build_a_list_of_possible_delimiters(new_total) - |> Enum.filter(fn {k, _v} -> Enum.member?(@delimiters, k) end) - |> Enum.into(%{}) - - cont_or_halt = if possible_delimiters == %{}, do: :cont, else: :halt - - {cont_or_halt, - %{ - frequency_tables: updated_frequency_tables, - possible_delimiters: possible_delimiters, - total: new_total - }} - end) - |> Map.get(:possible_delimiters) - |> pick_delimiter() - - %Dialect{dialect | delimiter: delimiter} - end - - defp guess_delimiter(dialect, _sample) do - dialect - end - @seven_bit_ascii Enum.into(0..127, %{}, &{&1, 0}) defp build_frequency_tables(data, acc) do diff --git a/mix.exs b/mix.exs index 2659e1a..78d5970 100644 --- a/mix.exs +++ b/mix.exs @@ -2,7 +2,7 @@ defmodule CsvSniffer.MixProject do use Mix.Project @source_url "https://github.com/doofinder/csv_sniffer" - @version "0.2.8" + @version "0.2.9" def project do [ From 2160f87907549ceb3a89d3d925584d810e9bcdeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sofia=20Moreno=20O=C3=B1ate?= Date: Tue, 26 Mar 2024 18:54:59 +0100 Subject: [PATCH 2/2] remove --- lib/csv_sniffer.ex | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/csv_sniffer.ex b/lib/csv_sniffer.ex index 49a8be5..7466cb1 100644 --- a/lib/csv_sniffer.ex +++ b/lib/csv_sniffer.ex @@ -54,7 +54,6 @@ defmodule CsvSniffer do For performance reasons, the data is evaluated in chunks, so it can try and evaluate the smallest portion of the data possible, evaluating additional chunks as necessary. """ - @spec guess_delimiter(CsvSniffer.Dialect.t(), binary) :: map def guess_delimiter(%Dialect{delimiter: nil} = dialect, sample) do # inside this function, we can be sure there's no escape character ... newline = find_newline_character(sample) @@ -97,7 +96,6 @@ defmodule CsvSniffer do end def guess_delimiter(dialect, _sample) do - IO.inspect("entra por el que") dialect end