% Conference paper (ICFHR 2008): language identification in document images
% using a shape codebook. `pdffile` is a non-standard field holding a link to
% the PDF; it is kept as-is for tools that read it.
@inproceedings{Zhu_ICFHR2008,
  author    = {Zhu, Guangyu and Yu, Xiaodong and Li, Yi and Doermann, David},
  title     = {Unconstrained Language Identification Using a Shape Codebook},
  booktitle = {The 11th International Conference on Frontiers in Handwriting Recognition ({ICFHR} 2008)},
  address   = {Montreal, Canada},
  pages     = {13--18},
  year      = {2008},
  pdffile   = {http://lampsrv02.umiacs.umd.edu/pubs/Papers/Zhu_ICFHR2008/Zhu_ICFHR2008.pdf},
  abstract  = {We propose a novel approach to language identification in document images containing handwriting and machine printed text using image descriptors constructed from a codebook of shape features. We encode local text structures using scale and rotation invariant codewords, each representing a characteristic shape feature that is generic enough to appear repeatably. We learn a concise, structurally indexed shape codebook from training data by clustering similar features and partitioning the feature space by graph cuts. Our approach is segmentation free and easily extensible. We quantitatively evaluate our approach using a large real-world document image collection, which consists of more than 1,500 documents in 8 languages (Arabic, Chinese, English, Hindi, Japanese, Korean, Russian, and Thai) and contains a complex mixture of handwritten and machine printed content. Experimental results demonstrate the robustness and flexibility of our approach, and show exceptional language identification performance that exceeds the state of art.},
}