<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record>
  <leader>     nad a22        4500</leader>
  <controlfield tag="001">555163202</controlfield>
  <controlfield tag="005">20190202120452.0</controlfield>
  <controlfield tag="007">cr unu---uuuuu</controlfield>
  <controlfield tag="008">190202s2018    xx      s     000 0 eng  </controlfield>
  <datafield tag="024" ind1="7" ind2="0">
   <subfield code="a">10.5167/uzh-162395</subfield>
   <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
   <subfield code="a">(ZORA)oai:www.zora.uzh.ch:162395</subfield>
  </datafield>
  <datafield tag="084" ind1=" " ind2=" ">
   <subfield code="a">000</subfield>
   <subfield code="2">ddc</subfield>
  </datafield>
  <datafield tag="084" ind1=" " ind2=" ">
   <subfield code="a">410</subfield>
   <subfield code="2">ddc</subfield>
  </datafield>
  <datafield tag="100" ind1="1" ind2=" ">
   <subfield code="a">Clematide</subfield>
   <subfield code="D">Simon</subfield>
  </datafield>
  <datafield tag="245" ind1="1" ind2="0">
   <subfield code="a">Crowdsourcing the OCR Ground Truth of a German and French Cultural Heritage Corpus</subfield>
   <subfield code="h">[Elektronische Daten]</subfield>
   <subfield code="c">[Simon Clematide, Lenz Furrer, Martin Volk]</subfield>
  </datafield>
  <datafield tag="506" ind1=" " ind2=" ">
   <subfield code="a">openAccess</subfield>
   <subfield code="2">eu-repo</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
   <subfield code="a">Crowdsourcing approaches for post-correction of OCR output (Optical Character Recognition) have been successfully applied to several historical text collections. We report on our crowd-correction platform Kokos, which we built to improve the OCR quality of the digitized yearbooks of the Swiss Alpine Club (SAC) from the 19th century. This multilingual heritage corpus consists of Alpine texts mainly written in German and French, all typeset in Antiqua font. Finding and engaging volunteers for correcting large amounts of pages into high quality text requires a carefully designed user interface, an easy-to-use workflow, and continuous efforts for keeping the participants motivated. More than 180,000 characters on about 21,000 pages were corrected by volunteers in about 7 months, achieving an OCR ground truth with a systematically evaluated accuracy of 99.7% on the word level. The crowdsourced OCR ground truth and the corresponding original OCR recognition results from Abbyy FineReader for each page are available as a resource for machine learning and evaluation. Additionally, the scanned images (300 dpi) of all pages are included to enable tests with other OCR software.</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Institute of Computational Linguistics</subfield>
   <subfield code="2">zora</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">ocr</subfield>
   <subfield code="2">zora</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">crowdsourcing</subfield>
   <subfield code="2">zora</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Furrer</subfield>
   <subfield code="D">Lenz</subfield>
   <subfield code="e">joint author</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Volk</subfield>
   <subfield code="D">Martin</subfield>
   <subfield code="e">joint author</subfield>
  </datafield>
  <datafield tag="773" ind1="0" ind2=" ">
   <subfield code="t">Journal for Language Technology and Computational Linguistics (JLCL)</subfield>
   <subfield code="g">33(1):25-47</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://www.zora.uzh.ch/id/eprint/162395/1/ClematideFurrer2018.pdf</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">WWW-Backlink auf das Repository</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="2">
   <subfield code="z">Onlinezugriff via WWW</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="u">https://jlcl.org/content/2-allissues/1-heft1-2018/jlcl_2018-1_2.pdf</subfield>
   <subfield code="B">ZORA</subfield>
  </datafield>
  <datafield tag="908" ind1=" " ind2=" ">
   <subfield code="D">1</subfield>
   <subfield code="a">Journal Article</subfield>
   <subfield code="z">PeerReviewed</subfield>
   <subfield code="2">zora</subfield>
  </datafield>
  <datafield tag="909" ind1=" " ind2="7">
   <subfield code="a">SNSF/Projectfunding/CRSII5_173719/CH</subfield>
   <subfield code="2">zora grantAgreement</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">856</subfield>
   <subfield code="E">40</subfield>
   <subfield code="u">https://www.zora.uzh.ch/id/eprint/162395/1/ClematideFurrer2018.pdf</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">WWW-Backlink auf das Repository</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">856</subfield>
   <subfield code="E">42</subfield>
   <subfield code="z">Onlinezugriff via WWW</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="u">https://jlcl.org/content/2-allissues/1-heft1-2018/jlcl_2018-1_2.pdf</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">100</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Clematide</subfield>
   <subfield code="D">Simon</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Furrer</subfield>
   <subfield code="D">Lenz</subfield>
   <subfield code="e">joint author</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Volk</subfield>
   <subfield code="D">Martin</subfield>
   <subfield code="e">joint author</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="P">773</subfield>
   <subfield code="E">0-</subfield>
   <subfield code="t">Journal for Language Technology and Computational Linguistics (JLCL)</subfield>
   <subfield code="g">33(1):25-47</subfield>
  </datafield>
  <datafield tag="898" ind1=" " ind2=" ">
   <subfield code="a">BK010053</subfield>
   <subfield code="b">XK010053</subfield>
   <subfield code="c">XK010000</subfield>
  </datafield>
  <datafield tag="949" ind1=" " ind2=" ">
   <subfield code="B">ZORA</subfield>
   <subfield code="F">ZORA</subfield>
   <subfield code="b">ZORA</subfield>
   <subfield code="j">Journal Article</subfield>
   <subfield code="c">openAccess</subfield>
  </datafield>
 </record>
</collection>
