Example Configuration File

### DART-ID configuration
### =========================

# List the input files here, or define them on the command line
# when running the tool.
# for example, dart_id -i /path/to/input1.txt /path/to/input2.txt
input:
  - /path/to/dat/FP061A/evidence.txt
  - /path/to/dat/FP062ABCD/evidence.txt
  - /path/to/dat/FP063/evidence.txt
  - /path/to/dat/FP064AG/evidence.txt


# Folder to output to. The updated evidence file, as well as the
# optional parameters files and figures will be deposited here
# this can also be specified on the command line. e.g.:
# -o /path/to/output/folder
#output: /path/to/output/folder

# Print diagnostic figures, as well as an HTML file that allows
# for quick browsing
print_figures: true

## Input Type Options
## ==========================

# column names of the input file
# as of now all input files have to be the same format
# change these as the input file changes,
# e.g., when a different search engine or search engine configuration is used
col_names:
  # These four columns are required. This program will not work without them.

  # Sequence can be the canonical amino-acid sequence, 
  # or the modified/annotated sequence, as provided by the search engine
  sequence: Modified sequence
  # The name of the raw/spectrum file, or a unique identifier for each
  # mass-spec run
  raw_file: Raw file
  # The retention/elution time of the ion, in minutes
  # This can also be in seconds, just make sure you update the priors in
  # model to reflect this change.
  retention_time: Retention time
  # The error probability of the peptide-spectrum-match. can be provided
  # by the search engine or by a separate program, e.g., Percolator
  pep: PEP

  # optional columns, that would be used for filtering or figure generation

  # Used to (optionally) append the ion charge state to the peptide sequence, 
  # so that peptides with different charge states are treated as different
  # peptide species.
  charge: Charge

  # Used to run the Fido protein inference algorithm
  leading_protein: Leading razor protein
  proteins: Proteins

  # The base peak width, i.e., the time range between when an ion 
  # first elutes to when it last elutes. Use this as a quality score 
  # in order to filter out poorly retained ions.
  retention_length: Retention length

  # Unused

  #intensity: Intensity
  #leading_gene: ~
  #genes: ~
  #exclude: ~
  #exp_id: ~
  #peptide_id: ~

## PSM Filters
## ======================

# Filters are used to exclude certain observations (PSMs) from
# the alignment process.
# Remove/comment-out filters from this list that you do not want to have.
filters:
  # Filter out entire raw files, especially if they are of a different run-time,
  # or if the LC for that experiment was problematic. The "expr" field is a regular
  # expression that will be checked against all raw files in the input.
  #- name: exclude_filename 
  #  expr: PS06[1-3][AB]|PS064F

  # Same as above, but as a whitelist instead of a blacklist
  #- name: include_filename
  #  expr: 2018A

  # Provide an exclusion list of UniProt IDs. Any PSM matching this
  # list will be filtered out
  # Either a file, with UniProt IDs separated by line breaks, can be
  # specified with the "file" field, or
  # a list of UniProt IDs can be provided in the "list" field
  #- name: uniprot_exclusion
  #  file: /path/to/list_of_uniprot_ids.txt
  #  list:
  #    - or_you_could
  #    - list_uniprot_ids_here
  #    - P36578
  #    - Q99797

  # Filter out contaminants as marked by the search engine
  # The "tag" option is the pattern used to filter out PSMs
  - name: contaminant
    tag: CON__

  # Filter out decoys as marked by the search engine
  # The "tag" option is the pattern used to filter out PSMs
  # - name: decoy
  #   tag: REV__

  # Filter out PSMs by the retention length, which is defined
  # by some search engines as the time at which this spectra is first
  # observed, to the time this spectra is last observed
  # 
  # If "dynamic" is set to true, then the threshold is a fraction of
  # the maximum RT for that raw file (i.e., the run-time). A value of 0.01
  # denotes that the threshold will be 1% of the total run-time of the experiment.
  - name: retention_length
    dynamic: true
    value: 0.01667

  # Filter out PSMs by their RT ranges in each experiment. This behavior is
  # similar but not exactly the same as the "retention_length" filter.
  # 
  # If "dynamic" is set to true, then the threshold is a fraction of
  # the maximum RT for that raw file (i.e., the run-time). A value of 0.01
  # denotes that the threshold will be 1% of the total run-time of the experiment.
  - name: smears
    dynamic: true
    value: 0.03333



### =======================
### !! ADVANCED SETTINGS !!
### =======================

# Only edit the following settings if you understand their effects
# Please refer to config_annotated.yaml for detailed descriptions for
# each configuration field

# Level of verbosity. Higher numbers = printing more information
# 0 = ERROR
# 1 = WARNING (default)
# 2 = INFO
# 3 = DEBUG
# verbose: 1


## Input
## ==========================

# Column delimiter of the input files. i.e., ',' for CSV, '\t' for tabular
# sep: \t

# The input data is loaded in with pandas, and it doesn't like
# some columns being mostly empty. This needs to be set to false 
# for input formats like MaxQuant.
# low_memory: false

# Instead of running a new STAN alignment, use a set of parameters
# from a previous run. The folder needs to include the three files
# outputted from a run with the "save_params" option on, and this run
# needs to be run with the exact same filters as that previous run.
# (exp_params.txt, peptide_params.txt, pair_params.txt)
# params_folder: /path/to/output_folder_from_prev_run

## Alignment Options
## ==========================

# Which alignment model to use
# Options: 'linear', 'two_piece_linear', 'two_piece_linear_laplace'
# model: 'two_piece_linear_laplace'

# add charge of ion onto the sequence, so that sequences ionized
# with different charge states will be aligned separately.
# 
# Sometimes peptide sequences will form chemical adducts on column
# that can reflect on the charge received by the peptide during the
# ioniziation process, and aligning differently charged peptides can
# account for these chromatographic changes
# add_charge_to_sequence: false

# Number of iterations to run when generating priors
# If the average error when generating priors is too high,
# or prohibitive for STAN, then increase these to get more accurate priors
# prior_iters: 10

# Number of iterations to run for STAN. If STAN is consistently hitting
# its iteration limit without reaching an optima it is happy with,
# then increase this number
# stan_iters: 20000

## Advanced Alignment Options

# Minimum value for mu, a canonical retention time (RT) for a peptide
# mu_min: 1

# Amount to distort RTs when calculating priors. If STAN is erroring out
# because the priors are already too close to the optima, then consider
# slowly increasing this value to give STAN more room to iterate.
# rt_distortion: 0


# Advanced STAN parameters (with cmdstan: https://mc-stan.org/users/interfaces/cmdstan),
# for the LBFGS optimization algorithm
# we recommend leaving these at their defaults.

# Line search step size for first iteration
# init_alpha: 0.001 

# Convergence tolerance on absolute changes in objective function value
# tol_obj: 1.e-12

# Convergence tolerance on relative changes in objective function value
# tol_rel_obj: 10000

# Convergence tolerance on the norm of the gradient
# tol_grad: 1.e-8

# Convergence tolerance on the relative norm of the gradient
# tol_rel_grad: 10000000

# Convergence tolerance on changes in parameter value
# tol_param: 1.e-8

# Amount of history to keep for L-BFGS
# history_size: 5

## Update Options
## ==========================

# DART-ID bootstraps the reference RT (mu), to account for uncertainty
# in the estimation and to penalize mu estimates derived from only a few data points (experiments)
# Ideally we would use the MCMC sampler (STAN) to sample the full posterior, but due to 
# technical/performance constraints we are doing this in python instead

# options -- parametric-mixture, parametric, non-parametric, none
# bootstrap_method: 'parametric_mixture'
# bootstrap_iters: 100

# How to aggregate bootstrapped samples
# The weighted mean uses the PEP of each PSM as the weights
# options -- mean, median, weighted_mean
# mu_estimation: 'median'

## Protein Inference Options
## ==========================

# Run protein inference on the newly updated PSMs with the Fido framework
# https://noble.gs.washington.edu/proj/fido
# Paper in J. Proteome Research: http://dx.doi.org/10.1021/pr100594k

# Most, if not all, parameters described below are also described in detail
# on the Fido website and by the helper tips for the command-line verison of Fido.

# To run protein inference, set this flag to true
# run_pi: true

# Parameters derived from a parameter search and optimizing over an objective
# that minimizes selecting false positives.
# Parameters listed below are the default for fido. Leave these, or specify them,
# to skip the parameter searching step.
# Comment out these three parameters to search for the best set of 3 parameters and
# then run protein inference with those.
# pi_gamma: 0.5
# pi_alpha: 0.1
# pi_beta:  0.01

# Log2 of maximum number of subgraph connected states. Graphs with more states
# than this threshold will be pruned. Increasing this number increases run-time,
# by a lot!
# pi_connected_protein_thresh: 14
# Clean up the peptide sequence string, by removing adjacent amino acids,
# modifications, and also switching isoleucine to leucine.
# pi_clean_peptide_name: false
# Default behavior is to cut all PSMs except for the highest scoring one,
# for each peptide, in order to simplify the graph. Set this to true to include
# all PSMs
# pi_use_all_psms: false
# Use protein group level inference
# pi_group_proteins: false
# Prune low-scoring PSMs from the graph before the main pruning procedure.
# The threshold in this case is 1e-2 (PEP > 0.99)
# pi_prune_low_scores: true
# Accuracy of the parameter selection. This will be ignored if pi_gamma, pi_alpha,
# and pi_beta are provided, as the selection will not be performed in the first place.
# 1 = best    / slower     (uses entire data file)
# 2 = relaxed / faster     (uses 300 observations)
# 3 = sloppy  / very fast  (uses 100 observations)
# pi_parameter_accuracy: 3

# Proteins in the "Proteins" column are assumed to be protein IDs in a string,
# separated by a delimiter, which is specified here:
# i.e., the delimiter is ';' if the "Proteins" string is:
#       "Protein1;Protein2;Protein3;Protein4"
# pi_protein_delimiter: ';'
# A substring that delineates decoy proteins. In the case of MaxQuant,
# all decoy proteins are prepended with the string "REV__"
# pi_decoy_tag: 'REV__'


## Output
## ==========================

# Save the parameters outputted by STAN into three text files.
# Use the "params_folder" option in a future run to use these
# parameters instead of running the alignment procedure again.
# save_params: true

# Default behavior is to only append two columns, the new PEP
# and the updated PEP. Set this to true to get many more columns
# added on.
# add_diagnostic_cols: false

# Overwrite the original PEP column with the updated PEP, and save
# the original PEP to the 'Spectra PEP' column.
# Useful for workflows that rely on the PEP column
# overwrite_pep: false

# Remove PSMs that have an FDR (q-value) below this value.
# 0.01 corresponds to selecting PSMs at an FDR of 1%
# psm_fdr_threshold: 0.01

# Remove PSMs that have an associated protein FDR (q-value) below this value.
# 0.01 corresponds to selecting proteins at an FDR of 1%
# protein_fdr_threshold: 0.01


# If providing multiple input files, combine them all into one
# tabular file and save it.
# save_combined_output: true
# The name of the combined output file.
# combined_output_name: ev_updated.txt

# If providing separate input files, then save the output files separately
# as well. This can be used in conjunction with 'save_combined_output'
# save_separate_output: false
# Save the separate output files into the same folder where they originally
# came from. **WARNING** this program does not check to see if it will overwrite
# an existing file. Please choose the options below carefully to avoid overwriting
# your original data!
# save_in_input_folder: false
# The suffix and extension of each of the separate output files. 
# For example, if one of the inputs was "evidence.txt", 
# the output would be "evidence_updated.txt"
# output_suffix: _updated
# output_ext: .txt

# Save logging messages to file?
# log_file: true

## Filters
## ==========================

# Lower threshold of PEP. PSMs with PEP higher than this value will not be
# considered for the alignment process
# These PSMs can still have their confidence updated, as long as there are 
# PSMs of the same sequence that have PEP below this value
# pep_threshold: 0.5

# Peptide sequences need to be observed in at least this number of experiments,
# at a PEP below the pep_threshold, in order to participate in the alignment process
# num_experiments: 3

# Minimum number of confident PSMs per experiment, in order to participate in RT alignment
# If an experiment has less than this number of confident PSMs, then all of its
# PSMs will be excluded from the RT alignment process
# min_psms_per_experiment: 50