Bie+05
Abstract:This work presents an unsupervised solution to language identification. The method sorts multilingual text corpora on the basis of sentences into the different languages that are contained and makes no assumptions on the number or size of the monolingual fractions. Evaluation on 7-lingual corpora and bilingual corpora show that the quality of classification is comparable to supervised approaches and works almost error-free from 100 sentences per language on.
Type: Inproceedings
Author: Biemann, C. and Teresniak, S.
Title: Disentangling from Babylonian Confusion - Unsupervized Language Identification
Booktitle: Proceedings of CICLing-2005, Computational Linguistics and Intelligent Text Processing
Year: 2005
Pages: 762-773
Publisher: Springer
@COMMENT{Bie+05: conference paper entry. Proper nouns and acronyms are braced so that styles applying sentence casing keep their capitals; the page range uses "--".}
@INPROCEEDINGS{Bie+05,
  AUTHOR    = {Biemann, C. and Teresniak, S.},
  TITLE     = {Disentangling from {Babylonian} Confusion -- Unsupervized Language Identification},
  BOOKTITLE = {Proceedings of {CICLing}-2005, Computational Linguistics and Intelligent Text Processing},
  YEAR      = {2005},
  PAGES     = {762--773},
  PUBLISHER = {Springer}
}