Skip to content

Commit

Permalink
Merge pull request #7 from doofinder/Sofia/change_functions_to_public
Browse files Browse the repository at this point in the history
Some functions to public
  • Loading branch information
sofia-doofinder authored Mar 27, 2024
2 parents 7d84d00 + 2160f87 commit fdbc0e5
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 67 deletions.
135 changes: 69 additions & 66 deletions lib/csv_sniffer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,79 @@ defmodule CsvSniffer do
{:ok, Dialect.t()} | {:error, reason :: any()}
def sniff(sample) when is_binary(sample) do
sample
|> guess_quote_and_delimiter()
|> guess_quote()
|> guess_delimiter(sample)
|> format_response()
end

# Looks for text enclosed between two identical quotes (the probable quotechar) which are
# preceded and followed by the same character (the probable delimiter).
#
# For example:
# ,'some text',
# The quote with the most wins, same with the delimiter. If there is no quotechar the delimiter
# can't be determined this way.
defp guess_quote_and_delimiter(sample) do
@doc """
The delimiter /should/ occur the same number of times on each row. However, due to malformed
data, it may not. We don't want an all or nothing approach, so we allow for small variations
in this number.
1) build a table of the frequency of each character on every line.
2) build a table of frequencies of this frequency (meta-frequency?), e.g. 'x occurred 5
times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows'
3) use the mode of the meta-frequency to determine the /expected/ frequency for that character
4) find out how often the character actually meets that goal
5) the character that best meets its goal is the delimiter
For performance reasons, the data is evaluated in chunks, so it can try and evaluate the
smallest portion of the data possible, evaluating additional chunks as necessary.
"""
def guess_delimiter(%Dialect{delimiter: nil} = dialect, sample) do
# inside this function, we can be sure there's no escape character ...
newline = find_newline_character(sample)

split_sample =
sample
|> String.split(newline)
|> Stream.reject(&(String.trim(&1) == ""))

initial_acc = %{frequency_tables: %{}, total: 0}

delimiter =
split_sample
|> Stream.chunk_every(10)
|> Enum.reduce_while(initial_acc, fn chunk,
%{frequency_tables: frequency_tables, total: total} ->
new_total = total + length(chunk)
updated_frequency_tables = build_frequency_tables(chunk, frequency_tables)

possible_delimiters =
updated_frequency_tables
|> get_mode_of_the_frequencies()
|> build_a_list_of_possible_delimiters(new_total)
|> Enum.filter(fn {k, _v} -> Enum.member?(@delimiters, k) end)
|> Enum.into(%{})

cont_or_halt = if possible_delimiters == %{}, do: :cont, else: :halt

{cont_or_halt,
%{
frequency_tables: updated_frequency_tables,
possible_delimiters: possible_delimiters,
total: new_total
}}
end)
|> Map.get(:possible_delimiters)
|> pick_delimiter()

%Dialect{dialect | delimiter: delimiter}
end

def guess_delimiter(dialect, _sample) do
dialect
end

@doc """
Looks for text enclosed between two identical quotes (the probable quotechar) which are
preceded and followed by the same character (the probable delimiter).
For example:
,'some text',
The quote with the most wins, same with the delimiter. If there is no quotechar the delimiter
can't be determined this way.
"""
@spec guess_quote(binary) :: CsvSniffer.Dialect.t()
def guess_quote(sample) do
sample
|> run_quote_regex()
|> count_matches()
Expand Down Expand Up @@ -276,63 +336,6 @@ defmodule CsvSniffer do
else: dialect
end

# The delimiter /should/ occur the same number of times on each row. However, due to malformed
# data, it may not. We don't want an all or nothing approach, so we allow for small variations
# in this number.
# 1) build a table of the frequency of each character on every line.
# 2) build a table of frequencies of this frequency (meta-frequency?), e.g. 'x occurred 5
# times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows'
# 3) use the mode of the meta-frequency to determine the /expected/ frequency for that
# character
# 4) find out how often the character actually meets that goal
# 5) the character that best meets its goal is the delimiter
# For performance reasons, the data is evaluated in chunks, so it can try and evaluate the
# smallest portion of the data possible, evaluating additional chunks as necessary.
defp guess_delimiter(%Dialect{delimiter: nil} = dialect, sample) do
# inside this function, we can be sure there's no escape character ...
newline = find_newline_character(sample)

split_sample =
sample
|> String.split(newline)
|> Stream.reject(&(String.trim(&1) == ""))

initial_acc = %{frequency_tables: %{}, total: 0}

delimiter =
split_sample
|> Stream.chunk_every(10)
|> Enum.reduce_while(initial_acc, fn chunk,
%{frequency_tables: frequency_tables, total: total} ->
new_total = total + length(chunk)
updated_frequency_tables = build_frequency_tables(chunk, frequency_tables)

possible_delimiters =
updated_frequency_tables
|> get_mode_of_the_frequencies()
|> build_a_list_of_possible_delimiters(new_total)
|> Enum.filter(fn {k, _v} -> Enum.member?(@delimiters, k) end)
|> Enum.into(%{})

cont_or_halt = if possible_delimiters == %{}, do: :cont, else: :halt

{cont_or_halt,
%{
frequency_tables: updated_frequency_tables,
possible_delimiters: possible_delimiters,
total: new_total
}}
end)
|> Map.get(:possible_delimiters)
|> pick_delimiter()

%Dialect{dialect | delimiter: delimiter}
end

defp guess_delimiter(dialect, _sample) do
dialect
end

@seven_bit_ascii Enum.into(0..127, %{}, &{&1, 0})

defp build_frequency_tables(data, acc) do
Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule CsvSniffer.MixProject do
use Mix.Project

@source_url "https://github.com/doofinder/csv_sniffer"
@version "0.2.8"
@version "0.2.9"

def project do
[
Expand Down

0 comments on commit fdbc0e5

Please sign in to comment.