Skip to content

Commit

Permalink
news + preprint
Browse files Browse the repository at this point in the history
  • Loading branch information
WPoelman committed Aug 15, 2024
1 parent 034ce8b commit 24c7f75
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 3 deletions.
22 changes: 20 additions & 2 deletions _bibliography/preprints.bib
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

@misc{ploeger2024what,
title = {What Is '{{Typological Diversity}}' in {{NLP}}?},
author = {Ploeger, Esther and Poelman, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
title = {What Is ''{{Typological Diversity}}'' in {{NLP}}?},
author = {Ploeger*, Esther and Poelman*, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
year = {2024},
month = feb,
number = {arXiv:2402.04222},
Expand All @@ -16,3 +16,21 @@ @misc{ploeger2024what
abbr = {arXiv},
bibtex_show = true
}

% arXiv preprint 2407.05022 (July 2024): a language sampling framework for typologically diverse evaluation in NLP.
% The eprint fields are grouped together; primaryclass mirrors the subject area named in the keywords field.
@misc{ploeger2024principled,
title = {A {{Principled Framework}} for {{Evaluating}} on {{Typologically Diverse Languages}}},
author = {Ploeger, Esther and Poelman, Wessel and {H{\o}eg-Petersen}, Andreas Holck and Schlichtkrull, Anders and {de Lhoneux}, Miryam and Bjerva, Johannes},
year = {2024},
month = jul,
number = {arXiv:2407.05022},
eprint = {2407.05022},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
publisher = {arXiv},
url = {https://arxiv.org/abs/2407.05022},
urldate = {2024-07-22},
abstract = {Beyond individual languages, multilingual natural language processing (NLP) research increasingly aims to develop models that perform well across languages generally. However, evaluating these systems on all the world's languages is practically infeasible. To attain generalizability, representative language sampling is essential. Previous work argues that generalizable multilingual evaluation sets should contain languages with diverse typological properties. However, 'typologically diverse' language samples have been found to vary considerably in this regard, and popular sampling methods are flawed and inconsistent. We present a language sampling framework for selecting highly typologically diverse languages given a sampling frame, informed by language typology. We compare sampling methods with a range of metrics and find that our systematic methods consistently retrieve more typologically diverse language selections than previous methods in NLP. Moreover, we provide evidence that this affects generalizability in multilingual model evaluation, emphasizing the importance of diverse language sampling in NLP evaluation.},
keywords = {Computer Science - Computation and Language},
abbr = {arXiv},
bibtex_show = true
}
2 changes: 1 addition & 1 deletion _bibliography/workshops.bib
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ @inproceedings{tatariya_sociolinguistically_2024
@inproceedings{poelman2024call,
title = {A {{Call}} for {{Consistency}} in {{Reporting Typological Diversity}}},
booktitle = {Proceedings of the 6th {{Workshop}} on {{Research}} in {{Computational Linguistic Typology}} and {{Multilingual NLP}}},
author = {Poelman, Wessel and Ploeger, Esther and {de Lhoneux}, Miryam and Bjerva, Johannes},
author = {Poelman*, Wessel and Ploeger*, Esther and {de Lhoneux}, Miryam and Bjerva, Johannes},
editor = {Hahn, Michael and Sorokin, Alexey and Kumar, Ritesh and Shcherbakov, Andreas and Otmakhova, Yulia and Yang, Jinrui and Serikov, Oleg and Rani, Priya and Ponti, Edoardo M. and Murado{\u g}lu, Saliha and Gao, Rena and Cotterell, Ryan and Vylomova, Ekaterina},
year = {2024},
month = mar,
Expand Down
8 changes: 8 additions & 0 deletions _news/2024-08-30-clin.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
layout: page

title: CLIN34
date: August 30, 2024
---

LAGoM will be at the 34th Meeting of [CLIN](https://clin34.leidenuniv.nl/) (Computational Linguistics in the Netherlands) at Leiden University. Kushal will present a poster on ongoing work, titled ['How Good is Your Wikipedia? Quality Estimation and Methods of Data Pruning for Non-English Wikipedias'](https://clin34.leidenuniv.nl/abstracts/how-good-is-your-wikipedia-quality-estimation-and-methods-of-data-pruning-for-non-english-wikipedias/). Wessel will present a poster titled ['Leveraging Linguistic Typology for Fairer Multilingual Language Technology'](https://clin34.leidenuniv.nl/abstracts/leveraging-linguistic-typology-for-fairer-multilingual-language-technology/), which covers two recent preprints.

0 comments on commit 24c7f75

Please sign in to comment.