seqgra.learner.protein module

MIT - CSAIL - Gifford Lab - seqgra

Abstract base class for amino acid sequence learners

@author: Konstantin Krismer

class ProteinMultiClassClassificationLearner(model_definition: seqgra.model.model.modeldefinition.ModelDefinition, data_dir: str, output_dir: str, validate_data: bool = True, gpu_id: int = 0, silent: bool = False)[source]

Bases: seqgra.learner.learner.MultiClassClassificationLearner

check_annotations(annotations: List[str]) → bool
check_labels(y: List[str], throw_exception: bool = True) → bool
check_sequence(x: List[str]) → bool[source]
abstract create_model() → None

Abstract method to create library-specific model.

Machine learning library specific implementations are provided for TensorFlow and PyTorch.

dataset_generator(file_name: str)
decode_x(x)[source]

TODO

TODO

Parameters

x (array) – TODO

decode_y(y)[source]

TODO

TODO

Parameters

y (array) – TODO

encode_x(x: List[str])[source]

TODO

TODO

Parameters

x (List[str]) – TODO

encode_y(y: List[str])[source]

TODO

TODO

Parameters

y (List[str]) – TODO

abstract evaluate_model(file_name: Optional[str] = None, x: Optional[List[str]] = None, y: Optional[List[str]] = None)

TODO

TODO

Parameters
  • file_name (Optional[str]) – TODO

  • x (Optional[List[str]]) – TODO

  • y (Optional[List[str]]) – TODO

Returns

TODO

Return type

array

Raises

Exception – if neither file_name nor (x and y) are specified

get_annotations_file(set_name: str = 'test') → str

Get path to annotations file.

E.g., get_annotations_file("training") returns {OUTPUTDIR}/input/{GRAMMAR ID}/training-annotation.txt, if it exists.

Parameters

set_name (str, optional) – set name can be one of the following: training, validation, or test; defaults to test

Returns

file path to annotations file

Return type

str

Raises

Exception – in case requested annotations file does not exist

get_examples_file(set_name: str = 'test') → str

Get path to examples file.

E.g., get_examples_file("training") returns {OUTPUTDIR}/input/{GRAMMAR ID}/training.txt, if it exists.

Parameters

set_name (str, optional) – set name can be one of the following: training, validation, or test; defaults to test

Returns

file path to examples file

Return type

str

Raises

Exception – in case requested examples file does not exist

get_label_set(y: List[str]) → Set[str]
abstract get_num_params() → seqgra.schema.ModelSize

TODO

TODO

get_sequence_length(file_name: str) → int
abstract load_model(file_name: Optional[str] = None)

TODO

TODO

Parameters

file_name (str, optional) – file name in output dir; default is library-dependent

parse_annotations_data(file_name: str) → seqgra.schema.AnnotationSet

Method to parse annotations data file.

Checks validity of annotations.

Parameters

file_name (str) – file name

Returns

annotations (List[str]): annotations; y (List[str]): labels

Return type

AnnotationSet

parse_examples_data(file_name: str) → seqgra.schema.ExampleSet[source]

Method to parse examples data file.

Checks validity of sequences with sequence data type specific implementations provided for DNA and amino acid sequences.

Parameters

file_name (str) – file name

Returns

x (List[str]): sequences; y (List[str]): labels

Return type

ExampleSet

abstract predict(file_name: Optional[str] = None, x: Optional[Any] = None, encode: bool = True)

TODO

TODO

Parameters
  • x (array) – TODO

  • encode (bool, optional) – whether x should be encoded; defaults to True

Raises

Exception – if neither file_name nor x are specified

abstract print_model_summary() → None

TODO

TODO

abstract save_model(file_name: Optional[str] = None)

TODO

TODO

Parameters

file_name (str, optional) – file name in output dir; default is library-dependent

abstract set_seed() → None

TODO

TODO

train_model(file_name_train: Optional[str] = None, file_name_val: Optional[str] = None, x_train: Optional[List[str]] = None, y_train: Optional[List[str]] = None, x_val: Optional[List[str]] = None, y_val: Optional[List[str]] = None) → None

Train model.

Specify either file_name_train and file_name_val or x_train, y_train, x_val, and y_val.

Parameters
  • file_name_train (Optional[str]) – TODO

  • file_name_val (Optional[str]) – TODO

  • x_train (Optional[List[str]]) – TODO

  • y_train (Optional[List[str]]) – TODO

  • x_val (Optional[List[str]]) – TODO

  • y_val (Optional[List[str]]) – TODO

Raises
  • Exception – output directory non-empty

  • Exception – specify either file_name_train and file_name_val or x_train, y_train, x_val, y_val

class ProteinMultiLabelClassificationLearner(model_definition: seqgra.model.model.modeldefinition.ModelDefinition, data_dir: str, output_dir: str, validate_data: bool = True, gpu_id: int = 0, silent: bool = False)[source]

Bases: seqgra.learner.learner.MultiLabelClassificationLearner

check_annotations(annotations: List[str]) → bool
check_labels(y: List[str], throw_exception: bool = True) → bool
check_sequence(x: List[str]) → bool[source]
abstract create_model() → None

Abstract method to create library-specific model.

Machine learning library specific implementations are provided for TensorFlow and PyTorch.

dataset_generator(file_name: str)
decode_x(x)[source]

TODO

TODO

Parameters

x (array) – TODO

decode_y(y)[source]

TODO

TODO

Parameters

y (array) – TODO

encode_x(x: List[str])[source]

TODO

TODO

Parameters

x (List[str]) – TODO

encode_y(y: List[str])[source]

TODO

TODO

Parameters

y (List[str]) – TODO

abstract evaluate_model(file_name: Optional[str] = None, x: Optional[List[str]] = None, y: Optional[List[str]] = None)

TODO

TODO

Parameters
  • file_name (Optional[str]) – TODO

  • x (Optional[List[str]]) – TODO

  • y (Optional[List[str]]) – TODO

Returns

TODO

Return type

array

Raises

Exception – if neither file_name nor (x and y) are specified

get_annotations_file(set_name: str = 'test') → str

Get path to annotations file.

E.g., get_annotations_file("training") returns {OUTPUTDIR}/input/{GRAMMAR ID}/training-annotation.txt, if it exists.

Parameters

set_name (str, optional) – set name can be one of the following: training, validation, or test; defaults to test

Returns

file path to annotations file

Return type

str

Raises

Exception – in case requested annotations file does not exist

get_examples_file(set_name: str = 'test') → str

Get path to examples file.

E.g., get_examples_file("training") returns {OUTPUTDIR}/input/{GRAMMAR ID}/training.txt, if it exists.

Parameters

set_name (str, optional) – set name can be one of the following: training, validation, or test; defaults to test

Returns

file path to examples file

Return type

str

Raises

Exception – in case requested examples file does not exist

get_label_set(y: List[str]) → Set[str]
abstract get_num_params() → seqgra.schema.ModelSize

TODO

TODO

get_sequence_length(file_name: str) → int
abstract load_model(file_name: Optional[str] = None)

TODO

TODO

Parameters

file_name (str, optional) – file name in output dir; default is library-dependent

parse_annotations_data(file_name: str) → seqgra.schema.AnnotationSet

Method to parse annotations data file.

Checks validity of annotations.

Parameters

file_name (str) – file name

Returns

annotations (List[str]): annotations; y (List[str]): labels

Return type

AnnotationSet

parse_examples_data(file_name: str) → seqgra.schema.ExampleSet[source]

Method to parse examples data file.

Checks validity of sequences with sequence data type specific implementations provided for DNA and amino acid sequences.

Parameters

file_name (str) – file name

Returns

x (List[str]): sequences; y (List[str]): labels

Return type

ExampleSet

abstract predict(file_name: Optional[str] = None, x: Optional[Any] = None, encode: bool = True)

TODO

TODO

Parameters
  • x (array) – TODO

  • encode (bool, optional) – whether x should be encoded; defaults to True

Raises

Exception – if neither file_name nor x are specified

abstract print_model_summary() → None

TODO

TODO

abstract save_model(file_name: Optional[str] = None)

TODO

TODO

Parameters

file_name (str, optional) – file name in output dir; default is library-dependent

abstract set_seed() → None

TODO

TODO

train_model(file_name_train: Optional[str] = None, file_name_val: Optional[str] = None, x_train: Optional[List[str]] = None, y_train: Optional[List[str]] = None, x_val: Optional[List[str]] = None, y_val: Optional[List[str]] = None) → None

Train model.

Specify either file_name_train and file_name_val or x_train, y_train, x_val, and y_val.

Parameters
  • file_name_train (Optional[str]) – TODO

  • file_name_val (Optional[str]) – TODO

  • x_train (Optional[List[str]]) – TODO

  • y_train (Optional[List[str]]) – TODO

  • x_val (Optional[List[str]]) – TODO

  • y_val (Optional[List[str]]) – TODO

Raises
  • Exception – output directory non-empty

  • Exception – specify either file_name_train and file_name_val or x_train, y_train, x_val, y_val