@phdthesis{Human-Machine:1979,
      recid = {1979},
      author = {Tchoua, Roselyne Barreto},
      title = {Hybrid Human-Machine Scientific Information Extraction},
      school = {University of Chicago},
      type = {Ph.D. thesis},
      year = {2019},
      month = aug,
      pages = {136},
      abstract = {A wealth of valuable research data is locked within the
      millions of research articles published every year. Reading and
      extracting pertinent information from those articles has become an
      unmanageable task for scientists. Moreover, these data are loosely
      structured, encoded in manuscripts of various formats, embedded in
      different content types, and are, in general, not machine accessible.
      Thus, studies that automatically leverage this valuable information are
      not tractable or even possible. Current approaches employ humans to
      manually extract data, define extraction rules, or annotate training
      corpora for machine learning approaches through tedious, time-consuming,
      error-prone, and sometimes expensive processes. In the specific case of
      scientific information extraction, the need for pointed expertise
      increases costs and decreases the generalization of extraction methods.
      This thesis seeks to demonstrate that an efficient combination of
      human-computer extraction techniques can considerably alleviate the
      burden on human curators, thereby speeding up discovery of new
      scientific facts and decreasing extraction costs. The thesis is
      investigated in the context of materials informatics, an emerging field
      that has the potential to greatly reduce time-to-market and development
      costs for new materials. Such efforts rely on access to large databases
      of material properties and therefore represent a suitable but not
      unique application for this research. This work addresses the challenge
      of populating a database of scientific facts by presenting three
      approaches with different levels of automation and human involvement.
      Specifically, these three approaches involve varying amounts of
      untrained, trained, and expert input in order to populate a database of
      polymer properties. The first effort, 𝛘DB, is a crowdsourcing system
      that employs and assists a semi-expert crowd to extract an important
      relation in polymer science. Here automation is limited, being
      concerned only with identifying appropriate elements of scientific
      articles to present to crowd members; however, the approach is shown to
      accelerate data extraction considerably. Increasing the automation and
      targeting a different relation, the Tg approach is a pipeline that uses
      a variety of computer and human modules or tasks to supplement the
      output of well-performing natural language processing software and to
      prioritize expert curation. Having identified scientific named entity
      recognition as a major challenge and prerequisite for relation
      extraction, the third approach, polyNER, uses minimal, focused expert
      knowledge to generate annotated, entity-rich corpora and to bootstrap
      scientific named entity classifiers. This work shows that systems
      combining existing software and minimal human input can achieve
      performance comparable to that of state-of-the-art domain-specific
      natural language processing software, and it demonstrates the potential
      of hybrid human-computer partnerships as alternatives to sometimes
      impractical state-of-the-art approaches.},
      url = {http://knowledge.uchicago.edu/record/1979},
      doi = {10.6082/uchicago.1979},
}