oxbow.core.VcfFile#

class oxbow.core.VcfFile(source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, fields: Literal['*'] | list[str] | None = '*', info_fields: Literal['*'] | list[str] | None = '*', genotype_fields: Literal['*'] | list[str] | None = '*', genotype_by: Literal['sample', 'field'] = 'sample', samples: Literal['*'] | list[str] | None = None, samples_nested: bool = False, coords: Literal['01', '11'] = '11', regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = 131072)#
__init__(source: str | Callable[[], IO[bytes] | str], compressed: bool = False, *, fields: Literal['*'] | list[str] | None = '*', info_fields: Literal['*'] | list[str] | None = '*', genotype_fields: Literal['*'] | list[str] | None = '*', genotype_by: Literal['sample', 'field'] = 'sample', samples: Literal['*'] | list[str] | None = None, samples_nested: bool = False, coords: Literal['01', '11'] = '11', regions: str | list[str] | None = None, index: str | Callable[[], IO[bytes] | str] | None = None, batch_size: int = 131072)#

Methods

__init__(source[, compressed, fields, ...])

batches()

Generate record batches from the data source.

dataset()

Convert the data source into a dataset.

dd([find_divisions])

Convert the data source to a Dask DataFrame.

fragments()

Get fragments of the data source.

pd()

Convert the data source to a Pandas DataFrame.

pl([lazy])

Convert the data source to a Polars DataFrame or LazyFrame.

regions(regions)

Query one or more genomic ranges within the data source.

scanner()

Create a low-level scanner for the data source.

to_dask([find_divisions])

Convert the data source to a Dask DataFrame.

to_duckdb(conn)

Convert the data source into a DuckDB Relation.

to_ipc()

Serialize the data source as an Arrow IPC stream.

to_pandas()

Convert the data source to a Pandas DataFrame.

to_polars([lazy])

Convert the data source to a Polars DataFrame or LazyFrame.

with_samples([samples, genotype_fields, ...])

Return a new data source with sample genotype data nested under a single "samples" struct column.

Attributes

chrom_names

List of reference sequence names declared in the header.

chrom_sizes

List of reference sequence names and their lengths in bp.

columns

The top-level column names of the projection.

genotype_field_defs

List of FORMAT field definitions declared in the header.

info_field_defs

List of INFO field definitions declared in the header.

samples

List of sample IDs declared in the header.

schema

The arrow schema of the projection.

batches() → Generator#

Generate record batches from the data source.

Yields:

RecordBatch – A record batch from the data source.

property chrom_names: list[str]#

List of reference sequence names declared in the header.

property chrom_sizes: list[tuple[str, int]]#

List of reference sequence names and their lengths in bp.

property columns: list[str]#

The top-level column names of the projection.

dataset() → BatchReaderDataset#

Convert the data source into a dataset.

A dataset is a collection of fragments that can be processed as a single logical entity.

Returns:

A dataset representation of the data source.

Return type:

BatchReaderDataset

dd(find_divisions=False)#

Convert the data source to a Dask DataFrame.

Parameters:

find_divisions (bool, optional) – If True, find divisions for the Dask DataFrame. Defaults to False.

Returns:

A Dask DataFrame representation of the data source.

Return type:

dask.dataframe.DataFrame

fragments() → list[BatchReaderFragment]#

Get fragments of the data source.

Fragments represent parts of the data source that can be processed independently.

Returns:

A list of fragments representing parts of the data source.

Return type:

list of BatchReaderFragment

property genotype_field_defs: list[tuple[str, str, str]]#

List of FORMAT field definitions declared in the header.

property info_field_defs: list[tuple[str, str, str]]#

List of INFO field definitions declared in the header.

pd()#

Convert the data source to a Pandas DataFrame.

Returns:

A Pandas DataFrame representation of the data source.

Return type:

pandas.DataFrame

pl(lazy=False)#

Convert the data source to a Polars DataFrame or LazyFrame.

Parameters:

lazy (bool, optional [default: False]) – If True, returns a LazyFrame.

Returns:

A polars representation of the data source.

Return type:

polars.DataFrame | polars.LazyFrame

regions(regions: str | list[str]) → Self#

Query one or more genomic ranges within the data source.

This method creates a new instance of the data source with the same parameters, overriding the regions to select from the data source.

Parameters:

regions (str | list[str]) – The regions to select from the data source. This can be a single region or a list of regions.

Return type:

DataSource

Notes

Genomic range strings can be in the following formats:

  • UCSC-style "chr:start-end": interpreted using the coordinate system of the data source.

  • Bracket-style "chr:[start,end]": explicitly 1-based, end-inclusive.

  • Bracket-style "chr:[start,end)": explicitly 0-based, end-exclusive.

property samples: list[str]#

List of sample IDs declared in the header.

scanner() → Any#

Create a low-level scanner for the data source.

property schema: Schema#

The arrow schema of the projection.

to_dask(find_divisions=False)#

Convert the data source to a Dask DataFrame.

Parameters:

find_divisions (bool, optional) – If True, find divisions for the Dask DataFrame. Defaults to False.

Returns:

A Dask DataFrame representation of the data source.

Return type:

dask.dataframe.DataFrame

to_duckdb(conn)#

Convert the data source into a DuckDB Relation.

Parameters:

conn (duckdb.DuckDBPyConnection) – The DuckDB connection.

Returns:

A DuckDB Relation representation of the data source.

Return type:

duckdb.DuckDBPyRelation

to_ipc() → bytes#

Serialize the data source as an Arrow IPC stream.

Returns:

The serialized data source in Arrow IPC format.

Return type:

bytes

to_pandas()#

Convert the data source to a Pandas DataFrame.

Returns:

A Pandas DataFrame representation of the data source.

Return type:

pandas.DataFrame

to_polars(lazy=False)#

Convert the data source to a Polars DataFrame or LazyFrame.

Parameters:

lazy (bool, optional [default: False]) – If True, returns a LazyFrame.

Returns:

A polars representation of the data source.

Return type:

polars.DataFrame | polars.LazyFrame

with_samples(samples: Literal['*'] | list[str] | None = '*', *, genotype_fields: Literal['*'] | list[str] | None = '*', group_by: Literal['sample', 'field'] = 'sample') → Self#

Return a new data source with sample genotype data nested under a single "samples" struct column.

Parameters:
  • samples ("*", list[str], or None, optional [default: "*"]) – Names of samples to include in the genotype output. "*" includes all samples declared in the header. Pass a list to select specific samples. None omits all sample genotype data.

  • genotype_fields ("*", list[str], or None, optional [default: "*"]) – Genotype (aka FORMAT) fields to project for each sample. "*" includes all FORMAT fields declared in the header. Pass a list to select specific fields. None omits all genotype fields.

  • group_by (Literal["sample", "field"], optional [default: "sample"]) – Determines how genotype data is organized within the "samples" struct. If "sample", each sample name is a sub-column with nested genotype fields. If "field", each genotype field is a sub-column with nested sample values.

Returns:

A new data source with sample genotype data nested under a single "samples" struct column.

Return type:

Self