|
Needle
An application for fast and efficient searches of NGS data.
|
#include <algorithm>#include <chrono>#include <deque>#include <filesystem>#include <iostream>#include <math.h>#include <mutex>#include <numeric>#include <omp.h>#include <ranges>#include <string>#include <thread>#include <robin_hood.h>#include <seqan3/alphabet/container/concatenated_sequences.hpp>#include <seqan3/alphabet/nucleotide/dna4.hpp>#include <seqan3/contrib/parallel/buffer_queue.hpp>#include <seqan3/core/concept/cereal.hpp>#include <seqan3/io/sequence_file/all.hpp>#include <seqan3/io/stream/detail/fast_istreambuf_iterator.hpp>#include <seqan3/utility/container/dynamic_bitset.hpp>#include <seqan3/utility/parallel/detail/latch.hpp>#include "ibf.hpp"#include "shared.hpp"Functions | |
| void | get_include_set_table (min_arguments const &args, std::filesystem::path const include_file, robin_hood::unordered_set< uint64_t > &include_table) |
| bool | check_for_fasta_format (std::vector< std::string > const &valid_extensions, std::string const &file_path) |
| uint8_t | calculate_cutoff (std::filesystem::path sequence_file, int samples) |
| void | fill_hash_table (min_arguments const &args, seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq > > &fin, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table, robin_hood::unordered_node_map< uint64_t, uint8_t > &cutoff_table, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, bool const only_include=false, uint8_t cutoff=0) |
| void | fill_hash_table_parallel (min_arguments const &args, seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq > > &fin, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table, robin_hood::unordered_node_map< uint64_t, uint8_t > &cutoff_table, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, bool const only_include=false, uint8_t cutoff=0) |
| void | count_genome (min_arguments const &args, std::filesystem::path include_file, std::filesystem::path exclude_file) |
| Creates a set of minimizers to ignore, which should be used as an input to count. | |
| void | count (min_arguments const &args, std::vector< std::filesystem::path > sequence_files, std::filesystem::path include_file, std::filesystem::path genome_file, bool paired) |
| Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available. | |
| void | read_binary (std::filesystem::path filename, robin_hood::unordered_node_map< uint64_t, uint16_t > &hash_table) |
| Reads a binary file that needle minimiser creates. | |
| void | read_binary_start (min_arguments &args, std::filesystem::path filename, uint64_t &num_of_minimisers, uint8_t &cutoff) |
| Reads the beginning of a binary file that needle minimiser creates. | |
| void | check_expression (std::vector< uint16_t > &expression_thresholds, uint8_t &number_expression_thresholds, std::filesystem::path const expression_by_genome_file) |
| void | check_cutoffs_samples (std::vector< std::filesystem::path > const &sequence_files, bool const paired, std::vector< int > &samples, std::vector< uint8_t > &cutoffs) |
| void | check_fpr (uint8_t const number_expression_thresholds, std::vector< double > &fprs) |
| void | get_expression_thresholds (uint8_t const number_expression_thresholds, robin_hood::unordered_node_map< uint64_t, uint16_t > const &hash_table, std::vector< uint16_t > &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, uint8_t const cutoff, bool const all=true) |
| void | get_filsize_per_expression_level (std::filesystem::path filename, uint8_t const number_expression_thresholds, std::vector< uint16_t > const &expression_thresholds, std::vector< uint64_t > &sizes, robin_hood::unordered_set< uint64_t > const &genome, bool all=true) |
| template<bool samplewise, bool minimiser_files_given = true> | |
| void | ibf_helper (std::vector< std::filesystem::path > const &minimiser_files, std::vector< double > const &fprs, estimate_ibf_arguments &ibf_args, std::vector< uint8_t > &cutoffs, size_t num_hash=1, std::filesystem::path expression_by_genome_file="", minimiser_arguments const &minimiser_args={}) |
| std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< double > &fpr, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, size_t num_hash) |
| Creates IBFs. | |
| std::vector< uint16_t > | ibf (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::vector< double > &fpr, std::filesystem::path const expression_by_genome_file, size_t num_hash) |
| Creates IBFs based on the minimiser files. | |
| template<typename float_or_int > | |
| void | read_levels (std::vector< std::vector< float_or_int > > &expressions, std::filesystem::path filename) |
| template<bool samplewise, bool minimiser_files_given = true> | |
| void | insert_helper (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::filesystem::path path_in, std::vector< uint8_t > &cutoffs, std::filesystem::path expression_by_genome_file="", minimiser_arguments const &minimiser_args={}) |
| std::vector< uint16_t > | insert (std::vector< std::filesystem::path > const &sequence_files, estimate_ibf_arguments &ibf_args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) |
| Insert into IBFs. | |
| std::vector< uint16_t > | insert (std::vector< std::filesystem::path > const &minimiser_files, estimate_ibf_arguments &ibf_args, std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise) |
| Insert into IBFs based on the minimiser files. | |
| void | delete_bin (std::vector< uint64_t > const &delete_files, estimate_ibf_arguments &ibf_args, std::filesystem::path path_in, bool samplewise) |
| Delete bins from ibfs. | |
| template<bool parallel = false> | |
| void | calculate_minimiser (std::vector< std::filesystem::path > const &sequence_files, robin_hood::unordered_set< uint64_t > const &include_set_table, robin_hood::unordered_set< uint64_t > const &exclude_set_table, min_arguments const &args, minimiser_arguments const &minimiser_args, unsigned const i, std::vector< uint8_t > &cutoffs) |
| void | minimiser (std::vector< std::filesystem::path > const &sequence_files, min_arguments const &args, minimiser_arguments &minimiser_args, std::vector< uint8_t > &cutoffs) |
| Create minimiser and header files. | |
| uint8_t calculate_cutoff | ( | std::filesystem::path | sequence_file, |
| int | samples | ||
| ) |
| void calculate_minimiser | ( | std::vector< std::filesystem::path > const & | sequence_files, |
| robin_hood::unordered_set< uint64_t > const & | include_set_table, | ||
| robin_hood::unordered_set< uint64_t > const & | exclude_set_table, | ||
| min_arguments const & | args, | ||
| minimiser_arguments const & | minimiser_args, | ||
| unsigned const | i, | ||
| std::vector< uint8_t > & | cutoffs | ||
| ) |
| void check_cutoffs_samples | ( | std::vector< std::filesystem::path > const & | sequence_files, |
| bool const | paired, | ||
| std::vector< int > & | samples, | ||
| std::vector< uint8_t > & | cutoffs | ||
| ) |
| void check_expression | ( | std::vector< uint16_t > & | expression_thresholds, |
| uint8_t & | number_expression_thresholds, | ||
| std::filesystem::path const | expression_by_genome_file | ||
| ) |
|
inline |
| void check_fpr | ( | uint8_t const | number_expression_thresholds, |
| std::vector< double > & | fprs | ||
| ) |
| void count | ( | min_arguments const & | args, |
| std::vector< std::filesystem::path > | sequence_files, | ||
| std::filesystem::path | include_file, | ||
| std::filesystem::path | genome_file, | ||
| bool | paired | ||
| ) |
Get the concrete expression values (= median of all counts of one transcript) for given experiments. This function can be used to estimate how good the median approach can be, if all count values are available.
| args | The minimiser arguments to use (seed, shape, window size). |
| sequence_files | The sequence files, which contain the reads. |
| include_file | A file containing the transcripts whose expression values should be determined. |
| genome_file | A "*.genome" file constructed with the command genome. |
| paired | Flag to indicate if input data is paired or not. |
| void count_genome | ( | min_arguments const & | args, |
| std::filesystem::path | include_file, | ||
| std::filesystem::path | exclude_file | ||
| ) |
Creates a set of minimizers to ignore, which should be used as an input to count.
| args | The minimiser arguments to use (seed, shape, window size). |
| include_file | A file containing the transcripts whose expression values should be determined. |
| exclude_file | A file containing minimizers which should be ignored. |
| void delete_bin | ( | std::vector< uint64_t > const & | delete_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| std::filesystem::path | path_in, | ||
| bool | samplewise | ||
| ) |
Delete bins from ibfs.
| delete_files | A vector of integers specifying the bins to delete. |
| ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
| path_in | Input directory. |
| samplewise | True, if expression levels were set beforehand. |
| void fill_hash_table | ( | min_arguments const & | args, |
| seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq > > & | fin, | ||
| robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table, | ||
| robin_hood::unordered_node_map< uint64_t, uint8_t > & | cutoff_table, | ||
| robin_hood::unordered_set< uint64_t > const & | include_set_table, | ||
| robin_hood::unordered_set< uint64_t > const & | exclude_set_table, | ||
| bool const | only_include = false, |
||
| uint8_t | cutoff = 0 |
||
| ) |
| void fill_hash_table_parallel | ( | min_arguments const & | args, |
| seqan3::sequence_file_input< my_traits, seqan3::fields< seqan3::field::seq > > & | fin, | ||
| robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table, | ||
| robin_hood::unordered_node_map< uint64_t, uint8_t > & | cutoff_table, | ||
| robin_hood::unordered_set< uint64_t > const & | include_set_table, | ||
| robin_hood::unordered_set< uint64_t > const & | exclude_set_table, | ||
| bool const | only_include = false, |
||
| uint8_t | cutoff = 0 |
||
| ) |
| void get_expression_thresholds | ( | uint8_t const | number_expression_thresholds, |
| robin_hood::unordered_node_map< uint64_t, uint16_t > const & | hash_table, | ||
| std::vector< uint16_t > & | expression_thresholds, | ||
| std::vector< uint64_t > & | sizes, | ||
| robin_hood::unordered_set< uint64_t > const & | genome, | ||
| uint8_t const | cutoff, | ||
| bool const | all = true |
||
| ) |
| void get_filsize_per_expression_level | ( | std::filesystem::path | filename, |
| uint8_t const | number_expression_thresholds, | ||
| std::vector< uint16_t > const & | expression_thresholds, | ||
| std::vector< uint64_t > & | sizes, | ||
| robin_hood::unordered_set< uint64_t > const & | genome, | ||
| bool | all = true |
||
| ) |
| void get_include_set_table | ( | min_arguments const & | args, |
| std::filesystem::path const | include_file, | ||
| robin_hood::unordered_set< uint64_t > & | include_table | ||
| ) |
| std::vector< uint16_t > ibf | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| std::vector< double > & | fpr, | ||
| std::filesystem::path const | expression_by_genome_file = "", |
||
| size_t | num_hash = 1 |
||
| ) |
Creates IBFs based on the minimiser files.
| minimiser_files | A vector of minimiser file paths. |
| ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
| fpr | The average false positive rate that should be used. |
| expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression_thresholds. |
| num_hash | The number of hash functions to use. |
| std::vector< uint16_t > ibf | ( | std::vector< std::filesystem::path > const & | sequence_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| minimiser_arguments & | minimiser_args, | ||
| std::vector< double > & | fpr, | ||
| std::vector< uint8_t > & | cutoffs, | ||
| std::filesystem::path const | expression_by_genome_file = "", |
||
| size_t | num_hash = 1 |
||
| ) |
Creates IBFs.
| sequence_files | A vector of sequence file paths. |
| ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
| minimiser_args | The minimiser specific arguments to use. |
| fpr | The average false positive rate that should be used. |
| cutoffs | List of cutoffs. |
| expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
| num_hash | The number of hash functions to use. |
| void ibf_helper | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
| std::vector< double > const & | fprs, | ||
| estimate_ibf_arguments & | ibf_args, | ||
| std::vector< uint8_t > & | cutoffs, | ||
| size_t | num_hash = 1, |
||
| std::filesystem::path | expression_by_genome_file = "", |
||
| minimiser_arguments const & | minimiser_args = {} |
||
| ) |
| std::vector< uint16_t > insert | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| std::filesystem::path const | expression_by_genome_file, | ||
| std::filesystem::path | path_in, | ||
| bool | samplewise | ||
| ) |
Insert into IBFs based on the minimiser files.
| minimiser_files | A vector of minimiser file paths. |
| ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
| expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression_thresholds. |
| path_in | Input directory. |
| samplewise | True, if expression levels were set beforehand. |
| std::vector< uint16_t > insert | ( | std::vector< std::filesystem::path > const & | sequence_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| minimiser_arguments & | minimiser_args, | ||
| std::vector< uint8_t > & | cutoffs, | ||
| std::filesystem::path const | expression_by_genome_file, | ||
| std::filesystem::path | path_in, | ||
| bool | samplewise | ||
| ) |
Insert into IBFs.
| sequence_files | A vector of sequence file paths. |
| ibf_args | The IBF specific arguments to use (bin size, number of hash functions, ...). See struct ibf_arguments. |
| minimiser_args | The minimiser specific arguments to use. |
| cutoffs | List of cutoffs. |
| expression_by_genome_file | File that contains the only minimisers that should be considered for the determination of the expression thresholds. |
| path_in | Input directory. |
| samplewise | True, if expression levels were set beforehand. |
| void insert_helper | ( | std::vector< std::filesystem::path > const & | minimiser_files, |
| estimate_ibf_arguments & | ibf_args, | ||
| std::filesystem::path | path_in, | ||
| std::vector< uint8_t > & | cutoffs, | ||
| std::filesystem::path | expression_by_genome_file = "", |
||
| minimiser_arguments const & | minimiser_args = {} |
||
| ) |
| void minimiser | ( | std::vector< std::filesystem::path > const & | sequence_files, |
| min_arguments const & | args, | ||
| minimiser_arguments & | minimiser_args, | ||
| std::vector< uint8_t > & | cutoffs | ||
| ) |
Create minimiser and header files.
| sequence_files | A vector of sequence file paths. |
| args | The minimiser arguments to use (seed, shape, window size). |
| minimiser_args | The minimiser specific arguments to use. |
| cutoffs | List of cutoffs. |
| void read_binary | ( | std::filesystem::path | filename, |
| robin_hood::unordered_node_map< uint64_t, uint16_t > & | hash_table | ||
| ) |
Reads a binary file that needle minimiser creates.
| filename | The filename of the binary file. |
| hash_table | The hash table to store minimisers into. |
| void read_binary_start | ( | min_arguments & | args, |
| std::filesystem::path | filename, | ||
| uint64_t & | num_of_minimisers, | ||
| uint8_t & | cutoff | ||
| ) |
Reads the beginning of a binary file that needle minimiser creates.
| args | Min arguments. |
| filename | The filename of the binary file. |
| num_of_minimisers | Variable in which the number of minimisers should be stored. |
| cutoff | Cutoff value. |
| void read_levels | ( | std::vector< std::vector< float_or_int > > & | expressions, |
| std::filesystem::path | filename | ||
| ) |