Patch summary: appends '*' to the first two author names of ploeger2024what
and poelman2024call, switches the quotation marks in the ploeger2024what
title to LaTeX double quotes, adds the ploeger2024principled preprint entry,
and adds the CLIN34 news page.
NOTE(review): field indentation inside the .bib hunks is assumed to be two
spaces; confirm against the target files (or apply with --ignore-whitespace).
NOTE(review): index hashes are those recorded in the original patch.

diff --git a/_bibliography/preprints.bib b/_bibliography/preprints.bib
index 2d5bcc9..a917fbf 100644
--- a/_bibliography/preprints.bib
+++ b/_bibliography/preprints.bib
@@ -1,7 +1,7 @@
 
 @misc{ploeger2024what,
-  title = {What Is '{{Typological Diversity}}' in {{NLP}}?},
-  author = {Ploeger, Esther and Poelman, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
+  title = {What Is ``{{Typological Diversity}}'' in {{NLP}}?},
+  author = {Ploeger*, Esther and Poelman*, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
   year = {2024},
   month = feb,
   number = {arXiv:2402.04222},
@@ -16,3 +16,21 @@ @misc{ploeger2024what
   abbr = {arXiv},
   bibtex_show = true
 }
+
+@misc{ploeger2024principled,
+  title = {A {{Principled Framework}} for {{Evaluating}} on {{Typologically Diverse Languages}}},
+  author = {Ploeger, Esther and Poelman, Wessel and {H{\o}eg-Petersen}, Andreas Holck and Schlichtkrull, Anders and {de Lhoneux}, Miryam and Bjerva, Johannes},
+  year = {2024},
+  month = jul,
+  number = {arXiv:2407.05022},
+  eprint = {2407.05022},
+  url = {http://arxiv.org/abs/2407.05022},
+  urldate = {2024-07-22},
+  abstract = {Beyond individual languages, multilingual natural language processing (NLP) research increasingly aims to develop models that perform well across languages generally. However, evaluating these systems on all the world's languages is practically infeasible. To attain generalizability, representative language sampling is essential. Previous work argues that generalizable multilingual evaluation sets should contain languages with diverse typological properties. However, 'typologically diverse' language samples have been found to vary considerably in this regard, and popular sampling methods are flawed and inconsistent. We present a language sampling framework for selecting highly typologically diverse languages given a sampling frame, informed by language typology. We compare sampling methods with a range of metrics and find that our systematic methods consistently retrieve more typologically diverse language selections than previous methods in NLP. Moreover, we provide evidence that this affects generalizability in multilingual model evaluation, emphasizing the importance of diverse language sampling in NLP evaluation.},
+  keywords = {Computer Science - Computation and Language},
+  archiveprefix = {arxiv},
+  abbr = {arXiv},
+  primaryclass = {cs.CL},
+  publisher = {arXiv},
+  bibtex_show = true
+}
diff --git a/_bibliography/workshops.bib b/_bibliography/workshops.bib
index a7ba744..d74ffd1 100644
--- a/_bibliography/workshops.bib
+++ b/_bibliography/workshops.bib
@@ -44,7 +44,7 @@ @inproceedings{tatariya_sociolinguistically_2024
 @inproceedings{poelman2024call,
   title = {A {{Call}} for {{Consistency}} in {{Reporting Typological Diversity}}},
   booktitle = {Proceedings of the 6th {{Workshop}} on {{Research}} in {{Computational Linguistic Typology}} and {{Multilingual NLP}}},
-  author = {Poelman, Wessel and Ploeger, Esther and {de Lhoneux}, Miryam and Bjerva, Johannes},
+  author = {Poelman*, Wessel and Ploeger*, Esther and {de Lhoneux}, Miryam and Bjerva, Johannes},
   editor = {Hahn, Michael and Sorokin, Alexey and Kumar, Ritesh and Shcherbakov, Andreas and Otmakhova, Yulia and Yang, Jinrui and Serikov, Oleg and Rani, Priya and Ponti, Edoardo M. and Murado{\u g}lu, Saliha and Gao, Rena and Cotterell, Ryan and Vylomova, Ekaterina},
   year = {2024},
   month = mar,
diff --git a/_news/2024-08-30-clin.md b/_news/2024-08-30-clin.md
new file mode 100644
index 0000000..e5305cb
--- /dev/null
+++ b/_news/2024-08-30-clin.md
@@ -0,0 +1,8 @@
+---
+layout: page
+
+title: CLIN34
+date: August 30, 2024
+---
+
+LAGoM will be at the 34th Meeting of [CLIN](https://clin34.leidenuniv.nl/) (Computational Linguistics in the Netherlands), at Leiden University. Kushal will present a poster about ongoing work titled ['How Good is Your Wikipedia? Quality Estimation and Methods of Data Pruning for Non-English Wikipedias'](https://clin34.leidenuniv.nl/abstracts/how-good-is-your-wikipedia-quality-estimation-and-methods-of-data-pruning-for-non-english-wikipedias/). Wessel will present a poster covering two recent pre-prints titled ['Leveraging Linguistic Typology for Fairer Multilingual Language Technology'](https://clin34.leidenuniv.nl/abstracts/leveraging-linguistic-typology-for-fairer-multilingual-language-technology/).