From 951ee408aa563c281f8be2254d759036622c2c23 Mon Sep 17 00:00:00 2001 From: Caleb Maclennan Date: Sun, 27 Oct 2024 11:16:15 +0300 Subject: [PATCH 1/5] chore(crate): Spin up space for a French implementation --- Makefile.am | 2 +- pyproject.toml | 1 + src/fr.rs | 20 ++++++++++++++++++++ src/lib.rs | 5 +++++ src/types.rs | 2 ++ 5 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 src/fr.rs diff --git a/Makefile.am b/Makefile.am index f88dc46..b00999b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -12,7 +12,7 @@ licensedir = $(datarootdir)/licenses/$(TRANSFORMED_PACKAGE_NAME) bin_PROGRAMS = decasify decasify_SOURCES = src/bin/decasify.rs src/content.rs src/cli.rs src/lib.rs src/types.rs src/traits.rs decasify_SOURCES += src/lua.rs src/python.rs src/wasm.rs -decasify_SOURCES += src/en.rs src/tr.rs +decasify_SOURCES += src/en.rs src/fr.rs src/tr.rs EXTRA_decasify_SOURCES = tests/cli.rs tests/lib.rs EXTRA_DIST = pyproject.toml spec/decasify_spec.lua tests/test_all.py plugin/decasify.lua sile/decasify.lua dist_doc_DATA = README.md CHANGELOG.md diff --git a/pyproject.toml b/pyproject.toml index 0630848..42335c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Natural Language :: English", + "Natural Language :: French", "Natural Language :: Turkish", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", diff --git a/src/fr.rs b/src/fr.rs new file mode 100644 index 0000000..8c0657f --- /dev/null +++ b/src/fr.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: © 2023 Caleb Maclennan +// SPDX-License-Identifier: LGPL-3.0-only + +use crate::{Chunk, StyleGuide}; + +pub fn titlecase(_chunk: Chunk, _style: StyleGuide) -> String { + todo!(); +} + +pub fn lowercase(_chunk: Chunk) -> String { + todo!(); +} + +pub fn uppercase(_chunk: Chunk) -> String { + todo!(); +} + +pub fn 
sentencecase(_chunk: Chunk) -> String { + todo!(); +} diff --git a/src/lib.rs b/src/lib.rs index 7d07406..05f6de9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,7 @@ pub mod python; pub mod wasm; mod en; +mod fr; mod tr; /// Convert a string to a specific case following typesetting conventions for a target locale @@ -62,6 +63,7 @@ pub fn titlecase( let style: StyleGuide = style.into(); match locale { Locale::EN => en::titlecase(chunk, style), + Locale::FR => fr::titlecase(chunk, style), Locale::TR => tr::titlecase(chunk, style), } } @@ -72,6 +74,7 @@ pub fn lowercase(chunk: impl Into, locale: impl Into) -> String { let locale: Locale = locale.into(); match locale { Locale::EN => en::lowercase(chunk), + Locale::FR => fr::lowercase(chunk), Locale::TR => tr::lowercase(chunk), } } @@ -82,6 +85,7 @@ pub fn uppercase(chunk: impl Into, locale: impl Into) -> String { let locale: Locale = locale.into(); match locale { Locale::EN => en::uppercase(chunk), + Locale::FR => fr::uppercase(chunk), Locale::TR => tr::uppercase(chunk), } } @@ -92,6 +96,7 @@ pub fn sentencecase(chunk: impl Into, locale: impl Into) -> Strin let locale: Locale = locale.into(); match locale { Locale::EN => en::sentencecase(chunk), + Locale::FR => fr::sentencecase(chunk), Locale::TR => tr::sentencecase(chunk), } } diff --git a/src/types.rs b/src/types.rs index 42d1e8d..dc0bc6d 100644 --- a/src/types.rs +++ b/src/types.rs @@ -43,6 +43,7 @@ pub type Result = std::result::Result; pub enum Locale { #[default] EN, + FR, TR, } @@ -85,6 +86,7 @@ impl FromStr for Locale { fn from_str(s: &str) -> Result { match s.to_ascii_lowercase().as_str() { "en" | "english" | "en_en" => Ok(Locale::EN), + "fr" | "french" | "fr_fr" | "français" => Ok(Locale::FR), "tr" | "turkish" | "tr_tr" | "türkçe" => Ok(Locale::TR), input => LocaleSnafu { input }.fail()?, } From eb729ebbe6370f0e9833b59aaf7ad1ef20347464 Mon Sep 17 00:00:00 2001 From: Caleb Maclennan Date: Sun, 27 Oct 2024 13:00:11 +0300 Subject: [PATCH 2/5] 
chore(crate): Delegate French lower, upper, and sentence case to English --- src/fr.rs | 19 ++++++------------- tests/lib.rs | 11 +++++++++++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/fr.rs b/src/fr.rs index 8c0657f..72f465a 100644 --- a/src/fr.rs +++ b/src/fr.rs @@ -1,20 +1,13 @@ // SPDX-FileCopyrightText: © 2023 Caleb Maclennan // SPDX-License-Identifier: LGPL-3.0-only -use crate::{Chunk, StyleGuide}; +use crate::content::{Chunk, Segment}; +use crate::types::StyleGuide; -pub fn titlecase(_chunk: Chunk, _style: StyleGuide) -> String { - todo!(); -} - -pub fn lowercase(_chunk: Chunk) -> String { - todo!(); -} +pub use crate::en::lowercase; +pub use crate::en::sentencecase; +pub use crate::en::uppercase; -pub fn uppercase(_chunk: Chunk) -> String { - todo!(); -} - -pub fn sentencecase(_chunk: Chunk) -> String { +pub fn titlecase(_chunk: Chunk, _style: StyleGuide) -> String { todo!(); } diff --git a/tests/lib.rs b/tests/lib.rs index 63f5dd7..7e4fac9 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -225,6 +225,8 @@ macro_rules! lowercase { lowercase!(lower_en, Locale::EN, "foo BAR BaZ BIKE", "foo bar baz bike"); +lowercase!(lower_fr, Locale::FR, "foo BAR BaZ BIKE", "foo bar baz bike"); + lowercase!( lower_tr, Locale::TR, @@ -244,6 +246,8 @@ macro_rules! 
uppercase { uppercase!(upper_en, Locale::EN, "foo BAR BaZ bike", "FOO BAR BAZ BIKE"); +uppercase!(upper_fr, Locale::FR, "foo BAR BaZ bike", "FOO BAR BAZ BIKE"); + uppercase!( upper_tr, Locale::TR, @@ -268,4 +272,11 @@ sentencecase!( "Insert bike here" ); +sentencecase!( + sentence_fr, + Locale::FR, + "insert BIKE here", + "Insert bike here" +); + sentencecase!(sentence_tr, Locale::TR, "ilk DAVRANSIN", "İlk davransın"); From 206c58d5a08489855889bc17a57e6fc36d9a1bea Mon Sep 17 00:00:00 2001 From: Caleb Maclennan Date: Sun, 27 Oct 2024 15:48:34 +0300 Subject: [PATCH 3/5] feat(crate): Implement French title casing Loosely ported from https://github.com/benoitvallon/titlecase-french --- src/fr.rs | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git a/src/fr.rs b/src/fr.rs index 72f465a..0bfcdc9 100644 --- a/src/fr.rs +++ b/src/fr.rs @@ -1,13 +1,91 @@ // SPDX-FileCopyrightText: © 2023 Caleb Maclennan // SPDX-License-Identifier: LGPL-3.0-only -use crate::content::{Chunk, Segment}; +use crate::content::{Chunk, Segment, Word}; use crate::types::StyleGuide; +use regex::Regex; +use unicode_titlecase::StrTitleCase; + pub use crate::en::lowercase; pub use crate::en::sentencecase; pub use crate::en::uppercase; -pub fn titlecase(_chunk: Chunk, _style: StyleGuide) -> String { - todo!(); +pub fn titlecase(chunk: Chunk, style: StyleGuide) -> String { + match style { + StyleGuide::LanguageDefault => titlecase_fr(chunk), + _ => todo!("French implementation doesn't support this style guide."), + } +} + +fn titlecase_fr(chunk: Chunk) -> String { + let mut chunk = chunk.clone(); + let mut words = chunk + .segments + .iter_mut() + .filter_map(|segment| match segment { + Segment::Word(word) => Some(word), + _ => None, + }) + .peekable(); + if let Some(word) = words.next() { + word.word = word.to_titlecase_lower_rest(); + } + while let Some(word) = words.next() { + word.word = match words.peek().is_none() { + true => 
word.to_titlecase_lower_rest(), + false => match is_reserved(word) { + true => word.to_lowercase(), + false => word.to_titlecase_lower_rest(), + }, + }; + } + chunk.into() } + +fn is_reserved(word: &Word) -> bool { + let word = word.to_lowercase(); + let word = word.as_str(); + // https://github.com/benoitvallon/titlecase-french/blob/83e092e91dccdd39871dfeac0d58dc06d997dabb/config.js#L22 + let lower_case_word_list = vec![ + "le", "la", "les", // definite articles + "un", "une", "des", // indefinite articles + "du", "de", "des", // partitive articles + "au", "aux", "du", "des", // contracted articles + "ce", "cet", "cette", "ces", // demonstrative adjectives + "quel", "quels", "quelle", "quelles", // exclamative adjectives + "mon", "ton", "son", "notre", "votre", "leur", "ma", "ta", "sa", "mes", "tes", "ses", + "nos", "vos", "leurs", // possessive adjectives + "mais", "ou", "et", "donc", "or", "ni", "car", "voire", + // coordinating conjunctions + "que", "qu", "quand", "comme", "si", "lorsque", "lorsqu", "puisque", "puisqu", "quoique", + "quoiqu", // subordinating conjunctions + "à", "chez", "dans", "entre", "jusque", "jusqu", "hors", "par", "pour", "sans", "vers", + "sur", "pas", "parmi", "avec", "sous", "en", // prepositions + "je", "tu", "il", "elle", "on", "nous", "vous", "ils", "elles", "me", "te", "se", "y", + // personal pronouns + "qui", "que", "quoi", "dont", "où", // relative pronouns + "ne", // others + ]; + let lower_case_words = lower_case_word_list.join("|"); + let lower_case_word = Regex::new(format!("^({lower_case_words})$").as_ref()).unwrap(); + lower_case_word.is_match(word) +} + +//capitalizedSpecials: [ +// { input: 'À', output: 'A'}, +// { input: 'Â', output: 'A'}, +// { input: 'Ä', output: 'A'}, +// { input: 'É', output: 'E'}, +// { input: 'È', output: 'E'}, +// { input: 'Ê', output: 'E'}, +// { input: 'Ë', output: 'E'}, +// { input: 'Ç', output: 'C'}, +// { input: 'Î', output: 'I'}, +// { input: 'Ï', output: 'I'}, +// { input: 'Ô', output: 
'O'}, +// { input: 'Ö', output: 'O'}, +// { input: 'Û', output: 'U'}, +// { input: 'Ü', output: 'U'}, +// { input: 'Ù', output: 'U'} +//], From c12caef519f5c0b0f84ad98ca02563d903dad726 Mon Sep 17 00:00:00 2001 From: Caleb Maclennan Date: Mon, 28 Oct 2024 19:02:38 +0300 Subject: [PATCH 4/5] test(crate): Add tests for French titlecasing --- tests/lib.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/lib.rs b/tests/lib.rs index 7e4fac9..7246e3b 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -213,6 +213,38 @@ titlecase!( " Serbest Serseri\n Boşluk " ); +titlecase!( + french_def, + Locale::FR, + StyleGuide::LanguageDefault, + "le triangle rouge", + "Le Triangle Rouge" +); + +titlecase!( + french_def2, + Locale::FR, + StyleGuide::LanguageDefault, + "loki, le détective mythique", + "Loki, le Détective Mythique" +); + +titlecase!( + french_coordinating, + Locale::FR, + StyleGuide::LanguageDefault, + "il est studieux mais turbulent", + "Il Est Studieux mais Turbulent" +); + +titlecase!( + french_coordinating2, + Locale::FR, + StyleGuide::LanguageDefault, + "mais comment font-ils?", + "Mais Comment Font-Ils?" +); + macro_rules! 
lowercase { ($name:ident, $locale:expr, $input:expr, $expected:expr) => { #[test] From 1cf0159896e232b5bc8bfa60edbb3d961a995992 Mon Sep 17 00:00:00 2001 From: Caleb Maclennan Date: Tue, 29 Oct 2024 11:31:22 +0300 Subject: [PATCH 5/5] chore(crate): Handle hyphenated compound words in French --- src/fr.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/fr.rs b/src/fr.rs index 0bfcdc9..6a48655 100644 --- a/src/fr.rs +++ b/src/fr.rs @@ -19,6 +19,21 @@ pub fn titlecase(chunk: Chunk, style: StyleGuide) -> String { } fn titlecase_fr(chunk: Chunk) -> String { + let mut segments: Vec = Vec::new(); + chunk.clone().segments.into_iter().for_each(|segment| { + match segment { + Segment::Separator(_) => segments.push(segment), + Segment::Word(ref word) => { + let mut segs = word.word.split("-").peekable(); + while let Some(s) = segs.next() { + segments.push(Segment::Word(Word { word: s.into() })); + if segs.peek().is_some() { + segments.push(Segment::Separator("-".into())); + } + } + } + }; + }); let mut chunk = chunk.clone(); let mut words = chunk .segments