diff --git a/Makefile.am b/Makefile.am
index f88dc46..b00999b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -12,7 +12,7 @@ licensedir = $(datarootdir)/licenses/$(TRANSFORMED_PACKAGE_NAME)
 bin_PROGRAMS = decasify
 decasify_SOURCES = src/bin/decasify.rs src/content.rs src/cli.rs src/lib.rs src/types.rs src/traits.rs
 decasify_SOURCES += src/lua.rs src/python.rs src/wasm.rs
-decasify_SOURCES += src/en.rs src/tr.rs
+decasify_SOURCES += src/en.rs src/fr.rs src/tr.rs
 EXTRA_decasify_SOURCES = tests/cli.rs tests/lib.rs
 EXTRA_DIST = pyproject.toml spec/decasify_spec.lua tests/test_all.py plugin/decasify.lua sile/decasify.lua
 dist_doc_DATA = README.md CHANGELOG.md
diff --git a/pyproject.toml b/pyproject.toml
index 0630848..42335c6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ classifiers = [
   "Development Status :: 5 - Production/Stable",
   "Intended Audience :: Developers",
   "Natural Language :: English",
+  "Natural Language :: French",
   "Natural Language :: Turkish",
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
diff --git a/src/fr.rs b/src/fr.rs
new file mode 100644
--- /dev/null
+++ b/src/fr.rs
@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: © 2023 Caleb Maclennan
+// SPDX-License-Identifier: LGPL-3.0-only
+
+//! French casing rules.
+//!
+//! Only title case has French-specific logic; lower, upper and sentence case are re-exported
+//! from the English implementation.
+
+use crate::content::{Chunk, Segment, Word};
+use crate::types::StyleGuide;
+
+use unicode_titlecase::StrTitleCase;
+
+pub use crate::en::lowercase;
+pub use crate::en::sentencecase;
+pub use crate::en::uppercase;
+
+/// Words written in lower case inside a title (the first and last words are exempt).
+///
+/// Adapted from the `titlecase-french` configuration:
+/// <https://github.com/benoitvallon/titlecase-french/blob/83e092e91dccdd39871dfeac0d58dc06d997dabb/config.js#L22>
+const LOWER_CASE_WORDS: &[&str] = &[
+    "le", "la", "les", // definite articles
+    "un", "une", "des", // indefinite articles
+    "du", "de", "des", // partitive articles
+    "au", "aux", "du", "des", // contracted articles
+    "ce", "cet", "cette", "ces", // demonstrative adjectives
+    "quel", "quels", "quelle", "quelles", // exclamative adjectives
+    "mon", "ton", "son", "notre", "votre", "leur", "ma", "ta", "sa", "mes", "tes", "ses",
+    "nos", "vos", "leurs", // possessive adjectives
+    "mais", "ou", "et", "donc", "or", "ni", "car", "voire",
+    // coordinating conjunctions
+    "que", "qu", "quand", "comme", "si", "lorsque", "lorsqu", "puisque", "puisqu", "quoique",
+    "quoiqu", // subordinating conjunctions
+    "à", "chez", "dans", "entre", "jusque", "jusqu", "hors", "par", "pour", "sans", "vers",
+    "sur", "pas", "parmi", "avec", "sous", "en", // prepositions
+    "je", "tu", "il", "elle", "on", "nous", "vous", "ils", "elles", "me", "te", "se", "y",
+    // personal pronouns
+    "qui", "que", "quoi", "dont", "où", // relative pronouns
+    "ne", // others
+];
+
+/// Convert `chunk` to French title case according to `style`.
+///
+/// # Panics
+///
+/// Panics for any style guide other than [`StyleGuide::LanguageDefault`]; no other guide is
+/// implemented for French.
+pub fn titlecase(chunk: Chunk, style: StyleGuide) -> String {
+    match style {
+        StyleGuide::LanguageDefault => titlecase_fr(chunk),
+        _ => todo!("French implementation doesn't support this style guide."),
+    }
+}
+
+/// Title-case `chunk` with the default French rules.
+///
+/// Hyphenated compounds are split first so that every part is cased as a word of its own
+/// ("font-ils" becomes "Font-Ils"). The first and the last word are always capitalized; any
+/// word in between is lower-cased when [`is_reserved`] matches it and capitalized otherwise.
+fn titlecase_fr(chunk: Chunk) -> String {
+    let mut chunk = chunk;
+    let mut segments: Vec<Segment> = Vec::new();
+    for segment in chunk.segments {
+        match segment {
+            Segment::Separator(_) => segments.push(segment),
+            Segment::Word(ref word) => {
+                let mut parts = word.word.split('-').peekable();
+                while let Some(part) = parts.next() {
+                    // A leading, trailing or doubled hyphen yields an empty part, which is
+                    // not a word and must not claim the first-word or last-word slot.
+                    if !part.is_empty() {
+                        segments.push(Segment::Word(Word { word: part.into() }));
+                    }
+                    if parts.peek().is_some() {
+                        segments.push(Segment::Separator("-".into()));
+                    }
+                }
+            }
+        }
+    }
+    // The loop above moved the original segments out; install the split ones in their place.
+    chunk.segments = segments;
+    let mut words = chunk
+        .segments
+        .iter_mut()
+        .filter_map(|segment| match segment {
+            Segment::Word(word) => Some(word),
+            _ => None,
+        })
+        .peekable();
+    if let Some(word) = words.next() {
+        word.word = word.to_titlecase_lower_rest();
+    }
+    while let Some(word) = words.next() {
+        // An empty `peek` means this is the final word, which is capitalized unconditionally.
+        let is_last = words.peek().is_none();
+        word.word = if !is_last && is_reserved(word) {
+            word.to_lowercase()
+        } else {
+            word.to_titlecase_lower_rest()
+        };
+    }
+    chunk.into()
+}
+
+/// Report whether `word`, compared case-insensitively, is listed in [`LOWER_CASE_WORDS`].
+fn is_reserved(word: &Word) -> bool {
+    let word = word.to_lowercase();
+    LOWER_CASE_WORDS.contains(&word.as_str())
+}
+
+// NOTE: upstream also defines a `capitalizedSpecials` table that strips the accent from capital
+// letters (À Â Ä → A, É È Ê Ë → E, Ç → C, Î Ï → I, Ô Ö → O, Û Ü Ù → U). No such mapping is
+// applied in this module.
diff --git a/src/lib.rs b/src/lib.rs
index 7d07406..05f6de9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,6 +30,7 @@ pub mod python;
 pub mod wasm;
 
 mod en;
+mod fr;
 mod tr;
 
 /// Convert a string to a specific case following typesetting conventions for a target locale
@@ -62,6 +63,7 @@ pub fn titlecase(
     let style: StyleGuide = style.into();
     match locale {
         Locale::EN => en::titlecase(chunk, style),
+        Locale::FR => fr::titlecase(chunk, style),
         Locale::TR => tr::titlecase(chunk, style),
     }
 }
@@ -72,6 +74,7 @@ pub fn lowercase(chunk: impl Into<Chunk>, locale: impl Into<Locale>) -> String {
     let locale: Locale = locale.into();
     match locale {
         Locale::EN => en::lowercase(chunk),
+        Locale::FR => fr::lowercase(chunk),
         Locale::TR => tr::lowercase(chunk),
     }
 }
@@ -82,6 +85,7 @@ pub fn uppercase(chunk: impl Into<Chunk>, locale: impl Into<Locale>) -> String {
     let locale: Locale = locale.into();
     match locale {
         Locale::EN => en::uppercase(chunk),
+        Locale::FR => fr::uppercase(chunk),
         Locale::TR => tr::uppercase(chunk),
     }
 }
@@ -92,6 +96,7 @@ pub fn sentencecase(chunk: impl Into<Chunk>, locale: impl Into<Locale>) -> Strin
     let locale: Locale = locale.into();
     match locale {
         Locale::EN => en::sentencecase(chunk),
+        Locale::FR => fr::sentencecase(chunk),
         Locale::TR => tr::sentencecase(chunk),
     }
 }
diff --git a/src/types.rs b/src/types.rs
index 42d1e8d..dc0bc6d 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -43,6 +43,7 @@ pub type Result = std::result::Result;
 pub enum Locale {
     #[default]
     EN,
+    FR,
     TR,
 }
 
@@ -85,6 +86,7 @@ impl FromStr for Locale {
     fn from_str(s: &str) -> Result<Self> {
         match s.to_ascii_lowercase().as_str() {
             "en" | "english" | "en_en" => Ok(Locale::EN),
+            "fr" | "french" | "fr_fr" | "français" => Ok(Locale::FR),
             "tr" | "turkish" | "tr_tr" | "türkçe" => Ok(Locale::TR),
             input => LocaleSnafu { input }.fail()?,
         }
diff --git a/tests/lib.rs b/tests/lib.rs
index 63f5dd7..7246e3b 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -213,6 +213,38 @@ titlecase!(
     " Serbest Serseri\n Boşluk "
 );
 
+titlecase!(
+    french_def,
+    Locale::FR,
+    StyleGuide::LanguageDefault,
+    "le triangle rouge",
+    "Le Triangle Rouge"
+);
+
+titlecase!(
+    french_def2,
+    Locale::FR,
+    StyleGuide::LanguageDefault,
+    "loki, le détective mythique",
+    "Loki, le Détective Mythique"
+);
+
+titlecase!(
+    french_coordinating,
+    Locale::FR,
+    StyleGuide::LanguageDefault,
+    "il est studieux mais turbulent",
+    "Il Est Studieux mais Turbulent"
+);
+
+titlecase!(
+    french_coordinating2,
+    Locale::FR,
+    StyleGuide::LanguageDefault,
+    "mais comment font-ils?",
+    "Mais Comment Font-Ils?"
+);
+
 macro_rules! lowercase {
     ($name:ident, $locale:expr, $input:expr, $expected:expr) => {
         #[test]
@@ -225,6 +257,8 @@ macro_rules! lowercase {
 
 lowercase!(lower_en, Locale::EN, "foo BAR BaZ BIKE", "foo bar baz bike");
 
+lowercase!(lower_fr, Locale::FR, "foo BAR BaZ BIKE", "foo bar baz bike");
+
 lowercase!(
     lower_tr,
     Locale::TR,
@@ -244,6 +278,8 @@ macro_rules! uppercase {
 
 uppercase!(upper_en, Locale::EN, "foo BAR BaZ bike", "FOO BAR BAZ BIKE");
 
+uppercase!(upper_fr, Locale::FR, "foo BAR BaZ bike", "FOO BAR BAZ BIKE");
+
 uppercase!(
     upper_tr,
     Locale::TR,
@@ -268,4 +304,11 @@ sentencecase!(
     "Insert bike here"
 );
 
+sentencecase!(
+    sentence_fr,
+    Locale::FR,
+    "insert BIKE here",
+    "Insert bike here"
+);
+
 sentencecase!(sentence_tr, Locale::TR, "ilk DAVRANSIN", "İlk davransın");