solr_schema.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!--  
 This is the Solr schema file. This file should be named "schema.xml" and
 should be in the conf directory under the solr home
 (i.e. ./solr/conf/schema.xml by default) 
 or located where the classloader for the Solr webapp can find it.

 This example schema is the recommended starting point for users.
 It should be kept correct and concise, usable out-of-the-box.

 For more information, on how to customize this file, please see
 http://wiki.apache.org/solr/SchemaXml
-->

<schema name="lpf" version="1.5">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.1" is Solr's version number for the schema syntax and semantics.  It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued
            by nature
       1.1: multiValued attribute introduced, false by default
       1.2: omitTermFreqAndPositions attribute introduced, true by default
            except for text fields.
       1.3: removed optional field compress feature
       1.4: autoGeneratePhraseQueries attribute introduced to drive QueryParser
            behavior when a single string produces multiple tokens.  Defaults
            to off for version >= 1.4
       1.5: omitNorms defaults to true for primitive field types
            (int, float, boolean, string...)
     -->

  <types>
    <!-- field type definitions. The "name" attribute is
       just a label to be used by field definitions.  The "class"
       attribute and any other attributes determine the real
       behavior of the fieldType.
         Class names starting with "solr" refer to java classes in the
       org.apache.solr.analysis package.
    -->

    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />

    <!-- boolean type: "true" or "false" -->
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>

    <!-- sortMissingLast and sortMissingFirst attributes are optional attributes
         currently supported on types that are sorted internally as strings
         and on numeric types.
             This includes "string","boolean", and, as of 3.5 (and 4.x),
             int, float, long, date, double, including the "Trie" variants.
       - If sortMissingLast="true", then a sort on this field will cause documents
         without the field to come after documents with the field,
         regardless of the requested sort order (asc or desc).
       - If sortMissingFirst="true", then a sort on this field will cause documents
         without the field to come before documents with the field,
         regardless of the requested sort order.
       - If sortMissingLast="false" and sortMissingFirst="false" (the default),
         then default lucene sorting will be used which places docs without the
         field first in an ascending sort and last in a descending sort.
    -->

    <!--
      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
    -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>

    <!--
     Numeric field types that index each value at various levels of precision
     to accelerate range queries when the number of values between the range
     endpoints is large. See the javadoc for NumericRangeQuery for internal
     implementation details.

     Smaller precisionStep values (specified in bits) will lead to more tokens
     indexed per value, slightly larger index size, and faster range queries.
     A precisionStep of 0 disables indexing at different precision levels.
    -->
    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>

    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
         is a more restricted form of the canonical representation of dateTime
         http://www.w3.org/TR/xmlschema-2/#dateTime
         The trailing "Z" designates UTC time and is mandatory.
         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
         All other components are mandatory.

         Expressions can also be used to denote calculations that should be
         performed relative to "NOW" to determine the value, ie...

               NOW/HOUR
                  ... Round to the start of the current hour
               NOW-1DAY
                  ... Exactly 1 day prior to now
               NOW/DAY+6MONTHS+3DAYS
                  ... 6 months and 3 days in the future from the start of
                      the current day

         Consult the DateField javadocs for more information.

         Note: For faster range queries, consider the tdate type
      -->
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>

    <!-- A Trie based date field for faster date range queries and date faceting. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>


    <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
    <fieldtype name="binary" class="solr.BinaryField"/>


    <!-- solr.TextField allows the specification of custom text analyzers
         specified as a tokenizer and a list of token filters. Different
         analyzers may be specified for indexing and querying.

         The optional positionIncrementGap puts space between multiple fields of
         this type on the same document, with the purpose of preventing false phrase
         matching across fields.

         For more info on customizing your analyzer chain, please see
         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
     -->

    <!-- A text field that only splits on whitespace for exact matching of words -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
        words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
        so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
        Synonyms and stopwords are customized by external files, and stemming is enabled.
        Duplicate tokens at the same position (which may result from Stemmed Synonyms or
        WordDelim parts) are removed.
        -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- 
          Refer to 'Solr 1.4 Enterprise Search Server', page 56, on why index 
          time synonyms should be used rather than query time. Using synonymy 
          at index and query time is redundant. -Ajay 03/22/2011
        -->
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
		
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
	<filter class="solr.SnowballPorterFilterFactory" protected="protwords.txt" language="English"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
	<filter class="solr.SnowballPorterFilterFactory" protected="protwords.txt" language="English"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="noexp_text" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Intended for name searches -->
    <fieldType name="nameType" class="solr.TextField" positionIncrementGap="100" >
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <fieldType name="exact_text" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- since fields of this type are by default not stored or indexed, any data added to 
         them will be ignored outright 
     --> 
    <fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" /> 

 </types>

 <fields>

   <!-- Valid attributes for fields:
     name: mandatory - the name for the field
     type: mandatory - the name of a field type from the
       <types> fieldType section
     indexed: true if this field should be indexed (searchable or sortable)
     stored: true if this field should be retrievable
     multiValued: true if this field may contain multiple values per document
     omitNorms: (expert) set to true to omit the norms associated with
       this field (this disables length normalization and index-time
       boosting for the field, and saves some memory).  Only full-text
       fields or fields that need an index-time boost need norms.
       Norms are omitted for primitive (non-analyzed) types by default.
     termVectors: [false] set to true to store the term vector for a
       given field.
       When using MoreLikeThis, fields used for similarity should be
       stored for best performance.
     termPositions: Store position information with the term vector.
       This will increase storage costs.
     termOffsets: Store offset information with the term vector. This
       will increase storage costs.
     required: The field is required.  It will throw an error if the
       value does not exist
     default: a value that should be used if no value is specified
       when adding a document.
   -->

   <!-- field names should consist of alphanumeric or underscore characters only and
      not start with a digit.  This is not currently strictly enforced,
      but other field names will not have first class support from all components
      and back compatibility is not guaranteed.  Names with both leading and
      trailing underscores (e.g. _version_) are reserved.
   -->

   <field name="p_uuid" type="string" indexed="false" stored="true" />
   <field name="mass_casualty_id" type="string" indexed="false" stored="true" />
   <field name="triage_category" type="string" indexed="false" stored="true" />

   <field name="hospital" type="noexp_text" indexed="false" stored="true"/>
   <field name="icon_url" type="text" indexed="false" stored="true"/>

   <field name="years_old" type="tint" indexed="false" stored="true" />
   <field name="minAge" type="tint" indexed="false" stored="true" />
   <field name="maxAge" type="tint" indexed="false" stored="true" />
   <field name="ageGroup" type="string" indexed="false" stored="true" />
   
   <field name="opt_status" type="string" indexed="false" stored="true" />
   <field name="opt_gender" type="string" indexed="false" stored="true" />
   <field name="shortname" type="string" indexed="false" stored="true" />
   <field name="updated" type="tdate" indexed="false" stored="true" />
   <field name="last_seen" type="text" indexed="false" stored="true" />
   <field name="comments" type="text" indexed="false" stored="true" />   

   <!-- Set indexed=true for imagestat application. -->
   <field name="image_height" type="tint" indexed="true" stored="true" />
   <field name="image_width" type="tint" indexed="true" stored="true" />

   <!-- Added for imagestat application (PL-449, PL-452, PL-479). -->
   <field name="image_id" type="int" indexed="true" stored="true" />
   <field name="url" type="string" indexed="true" stored="true" />
   <field name="created" type="tdate" indexed="true" stored="true"/>
   <field name="color_channels" type="tint" indexed="true" stored="true" />
   <field name="original_filename" type="string" indexed="false" stored="true" />
   <field name="gt_initial_person" type="string" indexed="true" stored="true" />
   <field name="gt_final_person" type="string" indexed="true" stored="true" />
   <field name="groundTruthStatus" type="int" indexed="true" default="0" stored="true" />
   <field name="initial_regions" type="string" indexed="false" stored="true" />
   <field name="final_regions" type="string" indexed="false" stored="true" />
   <field name="initial_updated_time" type="date" indexed="false" stored="true" />
   <field name="final_updated_time" type="date" indexed="false" stored="true" />
   
   <!-- Added for imagestat application (PL-457). --> 
   <field name="name" type="string" indexed="true" stored="true" />

   <!-- We use this index for filtering out records with images. -->
   <field name="url_thumb" type="text" indexed="true" stored="true" />

   <field name="full_name" type="nameType" indexed="false" stored="true" />
   <field name="family_name" type="nameType" indexed="false" stored="true" />
   <field name="given_name" type="nameType" indexed="false" stored="true" />   
   <field name="alternate_names" type="nameType" indexed="false" stored="true" />   
   <field name="expiry_date" type="tdate" indexed="false" stored="true"/>

   <!-- uncomment the following to ignore any fields that don't already match an existing
        field name or dynamic field, rather than reporting them as an error.
        alternately, change the type="ignored" to some other type e.g. "text" if you want
        unknown fields indexed and/or stored by default -->
   <!-- Ignore all undefined fields. Ajay 03/23/2011 -->
   <dynamicField name="*" type="ignored" multiValued="true" />
 </fields>

 <!-- Field to use to determine and enforce document uniqueness. 
      Unless this field is marked with required="false", it will be a required field
 -->
 <uniqueKey>image_id</uniqueKey>

</schema>