% Journal article: language identification for handwritten document images
% using a shape codebook (Pattern Recognition, vol. 42, 2009).
% The non-standard "pdffile" field is ignored by standard styles and kept as-is.
@article{LanguageID_PR2009,
  author   = {Zhu, Guangyu and Yu, Xiaodong and Li, Yi and Doermann, David},
  title    = {Language Identification for Handwritten Document Images Using a Shape Codebook},
  journal  = {Pattern Recognition},
  volume   = {42},
  pages    = {3184--3191},
  month    = dec,
  year     = {2009},
  pdffile  = {http://lampsrv02.umiacs.umd.edu/pubs/Papers/LanguageID_PR2009/LanguageID_PR2009.pdf},
  abstract = {Language identification for handwritten document images is an open document analysis problem. In this paper, we propose a novel approach to language identification for documents containing mixture of handwritten and machine printed text using image descriptors constructed from a codebook of shape features. We encode local text structures using scale and rotation invariant codewords, each representing a segmentation-free shape feature that is generic enough to be detected repeatably. We learn a concise, structurally indexed shape codebook from training by clustering and partitioning similar feature types through graph cuts. Our approach is easily extensible and does not require skew correction, scale normalization, or segmentation. We quantitatively evaluate our approach using a large real-world document image collection, which is composed of 1,512 documents in eight languages (Arabic, Chinese, English, Hindi, Japanese, Korean, Russian, and Thai) and contains a complex mixture of handwritten and machine printed content. Experiments demonstrate the robustness and flexibility of our approach, and show exceptional language identification performance that exceeds the state of the art.},
}