<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record>
  <leader>     caa a22        4500</leader>
  <controlfield tag="001">445379650</controlfield>
  <controlfield tag="003">CHVBK</controlfield>
  <controlfield tag="005">20180317143013.0</controlfield>
  <controlfield tag="007">cr unu---uuuuu</controlfield>
  <controlfield tag="008">170323e20110601xx      s     000 0 eng  </controlfield>
  <datafield tag="024" ind1="7" ind2="0">
   <subfield code="a">10.1007/s10032-010-0135-3</subfield>
   <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
   <subfield code="a">(NATIONALLICENCE)springer-10.1007/s10032-010-0135-3</subfield>
  </datafield>
  <datafield tag="245" ind1="0" ind2="0">
   <subfield code="a">Digital weight watching: reconstruction of scanned documents</subfield>
   <subfield code="h">[Elektronische Daten]</subfield>
   <subfield code="c">[Maarten Marx, Tim Gielissen]</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
   <subfield code="a">A web portal providing access to over 250.000 scanned and OCRed cultural heritage documents is analyzed. The collection consists of the complete Dutch Hansard from 1917 to 1995. Each document consists of facsimile images of the original pages plus hidden OCRed text. The inclusion of images yields large file sizes of which less than 2% is the actual text. The search user interface of the portal provides poor ranking and not very informative document summaries (snippets). Thus, users are instrumental in weeding out non-relevant results. For that, they must assess the complete documents. This is a time-consuming and frustrating process because of long download and processing times of the large files. Instead of using the scanned images for relevance assessment, we propose to use a reconstruction of the original document from a purely semantic representation. Evaluation on the Dutch dataset shows that these reconstructions become two orders of magnitude smaller and still resemble the original to a high degree. In addition, they are easier to speed-read and evaluate for relevance, due to added hyperlinks and a presentation optimized for reading from a terminal. We describe the reconstruction process and evaluate the costs, the benefits, and the quality.</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
   <subfield code="a">The Author(s), 2010</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">XML</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Information extraction</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Scanned documents</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Marx</subfield>
   <subfield code="D">Maarten</subfield>
   <subfield code="u">ISLA, University of Amsterdam, Science Park 107, 1098 XG, Amsterdam, The Netherlands</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Gielissen</subfield>
   <subfield code="D">Tim</subfield>
   <subfield code="u">ISLA, University of Amsterdam, Science Park 107, 1098 XG, Amsterdam, The Netherlands</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="773" ind1="0" ind2=" ">
   <subfield code="t">International Journal on Document Analysis and Recognition (IJDAR)</subfield>
   <subfield code="d">Springer-Verlag</subfield>
   <subfield code="g">14/2(2011-06-01), 229-239</subfield>
   <subfield code="x">1433-2833</subfield>
   <subfield code="q">14:2&lt;229</subfield>
   <subfield code="1">2011</subfield>
   <subfield code="2">14</subfield>
   <subfield code="o">10032</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://doi.org/10.1007/s10032-010-0135-3</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">Onlinezugriff via DOI</subfield>
  </datafield>
  <datafield tag="908" ind1=" " ind2=" ">
   <subfield code="D">1</subfield>
   <subfield code="a">research-article</subfield>
   <subfield code="2">jats</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">856</subfield>
   <subfield code="E">40</subfield>
   <subfield code="u">https://doi.org/10.1007/s10032-010-0135-3</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">Onlinezugriff via DOI</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Marx</subfield>
   <subfield code="D">Maarten</subfield>
   <subfield code="u">ISLA, University of Amsterdam, Science Park 107, 1098 XG, Amsterdam, The Netherlands</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Gielissen</subfield>
   <subfield code="D">Tim</subfield>
   <subfield code="u">ISLA, University of Amsterdam, Science Park 107, 1098 XG, Amsterdam, The Netherlands</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">773</subfield>
   <subfield code="E">0-</subfield>
   <subfield code="t">International Journal on Document Analysis and Recognition (IJDAR)</subfield>
   <subfield code="d">Springer-Verlag</subfield>
   <subfield code="g">14/2(2011-06-01), 229-239</subfield>
   <subfield code="x">1433-2833</subfield>
   <subfield code="q">14:2&lt;229</subfield>
   <subfield code="1">2011</subfield>
   <subfield code="2">14</subfield>
   <subfield code="o">10032</subfield>
  </datafield>
  <datafield tag="900" ind1=" " ind2="7">
   <subfield code="a">Metadata rights reserved</subfield>
   <subfield code="b">Springer special CC-BY-NC licence</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="898" ind1=" " ind2=" ">
   <subfield code="a">BK010053</subfield>
   <subfield code="b">XK010053</subfield>
   <subfield code="c">XK010000</subfield>
  </datafield>
  <datafield tag="949" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="F">NATIONALLICENCE</subfield>
   <subfield code="b">NL-springer</subfield>
  </datafield>
 </record>
</collection>
