From 54a83db68ae07063738e1c31cad6ae0ab37f2dd0 Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Mon, 20 Jun 2016 22:23:40 -0700 Subject: [PATCH] Workaround BCF2 limitations in htsjdk - bcf_str_missing is defined to be '\0' in htsjdk, whereas the BCF2 standard uses '\0' to represent vector end and 0x7 for missing chars --- include/query_operations/variant_operations.h | 6 ++++-- src/genomicsdb/variant_field_handler.cc | 6 ++++-- src/query_operations/broad_combined_gvcf.cc | 8 ++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/query_operations/variant_operations.h b/include/query_operations/variant_operations.h index f62971fd..05cbca10 100644 --- a/include/query_operations/variant_operations.h +++ b/include/query_operations/variant_operations.h @@ -181,7 +181,8 @@ class VariantFieldHandlerBase virtual bool compute_valid_element_wise_sum(const Variant& variant, const VariantQueryConfig& query_config, unsigned query_idx, void* output_ptr, unsigned num_elements) = 0; virtual bool collect_and_extend_fields(const Variant& variant, const VariantQueryConfig& query_config, - unsigned query_idx, const void ** output_ptr, unsigned& num_elements, const bool use_missing_values_only_not_vector_end=false) = 0; + unsigned query_idx, const void ** output_ptr, unsigned& num_elements, + const bool use_missing_values_only_not_vector_end=false, const bool use_vector_end_only=false) = 0; }; //Big bag handler functions useful for handling different types of fields (int, char etc) @@ -225,7 +226,8 @@ class VariantFieldHandler : public VariantFieldHandlerBase * Create an extended vector for use in BCF format fields, return result in output_ptr and num_elements */ bool collect_and_extend_fields(const Variant& variant, const VariantQueryConfig& query_config, - unsigned query_idx, const void ** output_ptr, unsigned& num_elements, const bool use_missing_values_only_not_vector_end=false); + unsigned query_idx, const void ** output_ptr, unsigned& num_elements, + const bool 
use_missing_values_only_not_vector_end=false, const bool use_vector_end_only=false); private: std::vector m_num_calls_with_valid_data; DataType m_bcf_missing_value; diff --git a/src/genomicsdb/variant_field_handler.cc b/src/genomicsdb/variant_field_handler.cc index 88c0ca24..094559ae 100644 --- a/src/genomicsdb/variant_field_handler.cc +++ b/src/genomicsdb/variant_field_handler.cc @@ -262,7 +262,8 @@ bool VariantFieldHandler::compute_valid_element_wise_sum(const Variant template bool VariantFieldHandler::collect_and_extend_fields(const Variant& variant, const VariantQueryConfig& query_config, - unsigned query_idx, const void ** output_ptr, unsigned& num_elements, const bool use_missing_values_only_not_vector_end) + unsigned query_idx, const void ** output_ptr, unsigned& num_elements, + const bool use_missing_values_only_not_vector_end, const bool use_vector_end_only) { auto max_elements_per_call = 0u; auto valid_idx = 0u; @@ -301,7 +302,8 @@ bool VariantFieldHandler::collect_and_extend_fields(const Variant& var } if(num_elements_inserted == 0u) //no elements inserted, insert missing value first { - m_extended_field_vector[extended_field_vector_idx] = get_bcf_missing_value(); + m_extended_field_vector[extended_field_vector_idx] = use_vector_end_only ? 
get_bcf_vector_end_value() + : get_bcf_missing_value(); ++num_elements_inserted; ++extended_field_vector_idx; } diff --git a/src/query_operations/broad_combined_gvcf.cc b/src/query_operations/broad_combined_gvcf.cc index 4ee89c01..620e83b5 100644 --- a/src/query_operations/broad_combined_gvcf.cc +++ b/src/query_operations/broad_combined_gvcf.cc @@ -243,6 +243,7 @@ void BroadCombinedGVCFOperator::handle_FORMAT_fields(const Variant& variant) auto known_field_enum = BCF_FORMAT_GET_KNOWN_FIELD_ENUM(curr_tuple); assert(known_field_enum < g_known_variant_field_names.size()); auto variant_type_enum = BCF_FORMAT_GET_VARIANT_FIELD_TYPE_ENUM(curr_tuple); + auto is_char_type = (variant_type_enum == VARIANT_FIELD_CHAR); //valid field handler assert(variant_type_enum < m_field_handlers.size() && m_field_handlers[variant_type_enum].get()); //Check if this is a field that was remapped - for remapped fields, we must use field objects from m_remapped_variant @@ -250,7 +251,8 @@ void BroadCombinedGVCFOperator::handle_FORMAT_fields(const Variant& variant) auto query_field_idx = m_query_config->get_query_idx_for_known_field_enum(known_field_enum); auto& src_variant = (m_remapping_needed && KnownFieldInfo::is_length_allele_dependent(known_field_enum)) ? 
m_remapped_variant : variant; auto valid_field_found = m_field_handlers[variant_type_enum]->collect_and_extend_fields(src_variant, *m_query_config, - query_field_idx, &ptr, num_elements, m_use_missing_values_not_vector_end); + query_field_idx, &ptr, num_elements, + m_use_missing_values_not_vector_end && !is_char_type, m_use_missing_values_not_vector_end && is_char_type); if(valid_field_found) { auto j=0u; @@ -326,10 +328,12 @@ void BroadCombinedGVCFOperator::handle_FORMAT_fields(const Variant& variant) auto& curr_tuple = m_unknown_FORMAT_fields_vec[i]; auto query_field_idx = BCF_FORMAT_GET_QUERY_FIELD_IDX(curr_tuple); auto variant_type_enum = BCF_FORMAT_GET_VARIANT_FIELD_TYPE_ENUM(curr_tuple); + auto is_char_type = (variant_type_enum == VARIANT_FIELD_CHAR); //valid field handler assert(variant_type_enum < m_field_handlers.size() && m_field_handlers[variant_type_enum].get()); auto valid_field_found = m_field_handlers[variant_type_enum]->collect_and_extend_fields(variant, *m_query_config, - query_field_idx, &ptr, num_elements, m_use_missing_values_not_vector_end); + query_field_idx, &ptr, num_elements, + m_use_missing_values_not_vector_end && !is_char_type, m_use_missing_values_not_vector_end && is_char_type); if(valid_field_found) bcf_update_format(m_vcf_hdr, m_bcf_out, m_query_config->get_query_attribute_name(query_field_idx).c_str(), ptr, num_elements, BCF_FORMAT_GET_BCF_HT_TYPE(curr_tuple));