<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record>
  <leader>     caa a22        4500</leader>
  <controlfield tag="001">463251377</controlfield>
  <controlfield tag="003">CHVBK</controlfield>
  <controlfield tag="005">20180405153341.0</controlfield>
  <controlfield tag="007">cr unu---uuuuu</controlfield>
  <controlfield tag="008">170326e20071001xx      s     000 0 eng  </controlfield>
  <datafield tag="024" ind1="7" ind2="0">
   <subfield code="a">10.1007/s10791-007-9027-7</subfield>
   <subfield code="2">doi</subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
   <subfield code="a">(NATIONALLICENCE)springer-10.1007/s10791-007-9027-7</subfield>
  </datafield>
  <datafield tag="245" ind1="0" ind2="3">
   <subfield code="a">An empirical study of tokenization strategies for biomedical information retrieval</subfield>
   <subfield code="h">[Elektronische Daten]</subfield>
   <subfield code="c">[Jing Jiang, ChengXiang Zhai]</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
   <subfield code="a">Due to the great variation of biological names in biomedical text, appropriate tokenization is an important preprocessing step for biomedical information retrieval. Despite its importance, there has been little study on the evaluation of various tokenization strategies for biomedical text. In this work, we conducted a careful, systematic evaluation of a set of tokenization heuristics on all the available TREC biomedical text collections for ad hoc document retrieval, using two representative retrieval methods and a pseudo-relevance feedback method. We also studied the effect of stemming and stop word removal on the retrieval performance. As expected, our experiment results show that tokenization can significantly affect the retrieval accuracy; appropriate tokenization can improve the performance by up to 96%, measured by mean average precision (MAP). In particular, it is shown that different query types require different tokenization heuristics, stemming is effective only for certain queries, and stop word removal in general does not improve the retrieval performance on biomedical text.</subfield>
  </datafield>
  <datafield tag="540" ind1=" " ind2=" ">
   <subfield code="a">Springer Science+Business Media, LLC, 2007</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Biomedical information retrieval</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Tokenization</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Stemming</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2="7">
   <subfield code="a">Stop word</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Jiang</subfield>
   <subfield code="D">Jing</subfield>
   <subfield code="u">Department of Computer Science, University of Illinois at Urbana-Champaign, 201 N Goodwin Ave, 61801, Urbana, IL, USA</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Zhai</subfield>
   <subfield code="D">ChengXiang</subfield>
   <subfield code="u">Department of Computer Science, University of Illinois at Urbana-Champaign, 201 N Goodwin Ave, 61801, Urbana, IL, USA</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="773" ind1="0" ind2=" ">
   <subfield code="t">Information Retrieval</subfield>
   <subfield code="d">Springer Netherlands</subfield>
   <subfield code="g">10/4-5(2007-10-01), 341-363</subfield>
   <subfield code="x">1386-4564</subfield>
   <subfield code="q">10:4-5&lt;341</subfield>
   <subfield code="1">2007</subfield>
   <subfield code="2">10</subfield>
   <subfield code="o">10791</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://doi.org/10.1007/s10791-007-9027-7</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">Onlinezugriff via DOI</subfield>
  </datafield>
  <datafield tag="908" ind1=" " ind2=" ">
   <subfield code="D">1</subfield>
   <subfield code="a">research-article</subfield>
   <subfield code="2">jats</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">856</subfield>
   <subfield code="E">40</subfield>
   <subfield code="u">https://doi.org/10.1007/s10791-007-9027-7</subfield>
   <subfield code="q">text/html</subfield>
   <subfield code="z">Onlinezugriff via DOI</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Jiang</subfield>
   <subfield code="D">Jing</subfield>
   <subfield code="u">Department of Computer Science, University of Illinois at Urbana-Champaign, 201 N Goodwin Ave, 61801, Urbana, IL, USA</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">700</subfield>
   <subfield code="E">1-</subfield>
   <subfield code="a">Zhai</subfield>
   <subfield code="D">ChengXiang</subfield>
   <subfield code="u">Department of Computer Science, University of Illinois at Urbana-Champaign, 201 N Goodwin Ave, 61801, Urbana, IL, USA</subfield>
   <subfield code="4">aut</subfield>
  </datafield>
  <datafield tag="950" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="P">773</subfield>
   <subfield code="E">0-</subfield>
   <subfield code="t">Information Retrieval</subfield>
   <subfield code="d">Springer Netherlands</subfield>
   <subfield code="g">10/4-5(2007-10-01), 341-363</subfield>
   <subfield code="x">1386-4564</subfield>
   <subfield code="q">10:4-5&lt;341</subfield>
   <subfield code="1">2007</subfield>
   <subfield code="2">10</subfield>
   <subfield code="o">10791</subfield>
  </datafield>
  <datafield tag="900" ind1=" " ind2="7">
   <subfield code="a">Metadata rights reserved</subfield>
   <subfield code="b">Springer special CC-BY-NC licence</subfield>
   <subfield code="2">nationallicence</subfield>
  </datafield>
  <datafield tag="898" ind1=" " ind2=" ">
   <subfield code="a">BK010053</subfield>
   <subfield code="b">XK010053</subfield>
   <subfield code="c">XK010000</subfield>
  </datafield>
  <datafield tag="949" ind1=" " ind2=" ">
   <subfield code="B">NATIONALLICENCE</subfield>
   <subfield code="F">NATIONALLICENCE</subfield>
   <subfield code="b">NL-springer</subfield>
  </datafield>
 </record>
</collection>
