Polars module
Module summary
The polars module uses the polars dataframe library, relying on its built-in multiprocessing and query optimization engine to allow you to run your queries blazingly fast. If you have a large amount of data (hundreds of thousands of rows or more), it's strongly preferred that you use this interface.
It's preferred that you use the functional interface for the polars module, i.e. call immunum.polars.number() and immunum.polars.segment() directly instead of calling immunum.polars.numbering_method(...). Those methods provide the same functionality, and will be used in the future for runtime-built annotators, but currently this syntax is just less convenient and requires building annotators in advance.
Also, note that if you don't need a column, don't materialize it -- on big dataframes, queries will get significantly faster. For instance, the example below runs in under a second for a dataset with 8M entries (200,000 of them are actual paired sequences), while full numbering would take around 10 times longer.
Example: calculating unique cdr3_light/heavy pairs
Segment heavy and light chains in parallel, then deduplicate on their CDR3 sequences.
Note that .unique() is called before materializing any other columns — this avoids
loading the full FR region strings for every row.
import pytest
import polars as pl
import immunum.polars as imp
def unique_cdr3_pairs(source: str) -> pl.DataFrame:
    """Return up to 20 distinct (cdr3_heavy, cdr3_light) pairs from *source*.

    Rows missing either chain are dropped before segmentation, and only the
    CDR3 columns are kept past the .unique() call, so the streaming engine
    never has to materialize the framework-region strings for every row.
    """
    heavy_col = "sequence_alignment_aa_heavy"
    light_col = "sequence_alignment_aa_light"
    both_chains_present = pl.all_horizontal(
        pl.col(heavy_col).is_not_null(),
        pl.col(light_col).is_not_null(),
    )
    segmented = (
        pl.scan_parquet(source)
        .filter(both_chains_present)
        .select(
            imp.segment(heavy_col, chains=["IGH"], scheme="IMGT")
            .name.suffix_fields("_heavy")
            .struct.unnest(),
            imp.segment(light_col, chains=["IGL", "IGK"], scheme="IMGT")
            .name.suffix_fields("_light")
            .struct.unnest(),
        )
    )
    return (
        segmented.unique(subset=["cdr3_heavy", "cdr3_light"])
        .select("cdr3_heavy", "cdr3_light")
        .head(20)
        .collect(engine="streaming")
    )
# Skipped under pytest: the example needs a real parquet dataset on disk.
pytest.skip("requires parquet data files")
print(unique_cdr3_pairs("source.parquet"))
# shape: (20, 2)
# ┌──────────────────────┬──────────────┐
# │ cdr3_heavy ┆ cdr3_light │
# │ --- ┆ --- │
# │ str ┆ str │
# ╞══════════════════════╪══════════════╡
# │ ARDLSQGYFDY ┆ QQYYSTPYT │
# │ ASLRGITGTTDS ┆ AAWDDSLKGVV │
# │ ARERPQALCFDP ┆ QQYSSSLYT │
# │ AKDLEGKHHYFDY ┆ QQYGSSLWT │
# │ ARAGSWNKDYYDSSGPLDY ┆ QQYNSYFTWT │
# │ … ┆ … │
# │ ALTSGYSSGWPFDY ┆ QQYYSTPLT │
# │ ARHRSIAARPAIYYYYYMDV ┆ QSYDSSLSGFYV │
# │ ARNLRGYSYGPDAFDI ┆ QQYDNLPYT │
# │ VRSPGWSFDF ┆ QQYGSSPSPMYT │
# │ ARQKVGSRSPNWYFDL ┆ QQYNNWPRT │
# └──────────────────────┴──────────────┘
Example: matching with another dataframe on framework sets
For each unique light+heavy framework combination in the source dataset, count how many sequences in the reference dataset share the same frameworks, then return the 20 most common groups.
import pytest
import polars as pl
import immunum.polars as imp
# Input column names holding the paired heavy/light amino-acid alignments.
HEAVY = "sequence_alignment_aa_heavy"
LIGHT = "sequence_alignment_aa_light"
# The six framework-region columns emitted by segmentation after the
# "_heavy"/"_light" suffixes are applied via name.suffix_fields.
FRAMEWORK_COLS = ["fr1_heavy", "fr2_heavy", "fr3_heavy", "fr1_light", "fr2_light", "fr3_light"]
def top_framework_groups(source: str, target: str, n: int = 10_000) -> pl.DataFrame:
    """Return the 20 framework combinations with the most source/target matches.

    Each input is limited to its first *n* rows that have both chains; the
    inner join pairs every source row with every target row sharing all six
    framework regions, so ``count`` is the number of such pairs per group.
    """

    def _segment(path: str) -> pl.LazyFrame:
        # Keep only rows with both chains present, cap at n rows, then
        # unnest the heavy and light segmentation structs side by side.
        paired = pl.all_horizontal(
            pl.col(HEAVY).is_not_null(),
            pl.col(LIGHT).is_not_null(),
        )
        heavy_fields = (
            imp.segment(HEAVY, chains=["IGH"], scheme="IMGT")
            .name.suffix_fields("_heavy")
            .struct.unnest()
        )
        light_fields = (
            imp.segment(LIGHT, chains=["IGK", "IGL"], scheme="IMGT")
            .name.suffix_fields("_light")
            .struct.unnest()
        )
        return (
            pl.scan_parquet(path)
            .filter(paired)
            .head(n)
            .select(heavy_fields, light_fields)
        )

    joined = _segment(source).join(_segment(target), on=FRAMEWORK_COLS, how="inner")
    return (
        joined.group_by(FRAMEWORK_COLS)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .head(20)
        .collect(engine="streaming")
    )
# Skipped under pytest: the example needs two real parquet datasets on disk.
pytest.skip("requires parquet data files")
print(top_framework_groups("source.parquet", "target.parquet"))
# shape: (20, 7)
# ┌───────────────────────────┬───────────────────┬─────────────────────────────────┬────────────────────────────┬───────────────────┬─────────────────────────────────┬───────┐
# │ fr1_heavy ┆ fr2_heavy ┆ fr3_heavy ┆ fr1_light ┆ fr2_light ┆ fr3_light ┆ count │
# │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ u32 │
# ╞═══════════════════════════╪═══════════════════╪═════════════════════════════════╪════════════════════════════╪═══════════════════╪═════════════════════════════════╪═══════╡
# │ EVQLLESGGGLVQPGGSLRLSCAAS ┆ MSWVRQAPGKGLEWVSA ┆ YYADSVKGRFTISRDNSKNTLYLQMNSLRA… ┆ EIVLTQSPGTLSLSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ SRATGIPDRFSGSGSGTDFTLTISRLEPED… ┆ 3248 │
# │ QVQLVESGGGVVQPGRSLRLSCAAS ┆ MHWVRQAPGKGLEWVAV ┆ YYADSVKGRFTISRDNSKNTLYLQMNSLRA… ┆ DIQMTQSPSSLSASVGDRVTITCRAS ┆ LNWYQQKPGKAPKLLIY ┆ SLQSGVPSRFSGSGSGTDFTLTISSLQPED… ┆ 2025 │
# │ QVQLQESGPGLVKPSETLSLTCTVS ┆ WSWIRQPPGKGLEWIGY ┆ NYNPSLKSRVTISVDTSKNQFSLKLSSVTA… ┆ DIQMTQSPSSLSASVGDRVTITCRAS ┆ LNWYQQKPGKAPKLLIY ┆ SLQSGVPSRFSGSGSGTDFTLTISSLQPED… ┆ 1998 │
# │ EVQLLESGGGLVQPGGSLRLSCAAS ┆ MSWVRQAPGKGLEWVSA ┆ YYADSVKGRFTISRDNSKNTLYLQMNSLRA… ┆ EIVMTQSPATLSVSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ TRATGIPARFSGSGSGTEFTLTISSLQSED… ┆ 1961 │
# │ QVQLQESGPGLVKPSETLSLTCTVS ┆ WSWIRQPPGKGLEWIGY ┆ NYNPSLKSRVTISVDTSKNQFSLKLSSVTA… ┆ EIVLTQSPGTLSLSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ SRATGIPDRFSGSGSGTDFTLTISRLEPED… ┆ 1872 │
# │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
# │ EVQLVESGGGLVQPGRSLRLSCAAS ┆ MHWVRQAPGKGLEWVSG ┆ GYADSVKGRFTISRDNAKNSLYLQMNSLRA… ┆ EIVLTQSPGTLSLSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ SRATGIPDRFSGSGSGTDFTLTISRLEPED… ┆ 990 │
# │ EVQLLESGGGLVQPGGSLRLSCAAS ┆ MSWVRQAPGKGLEWVSA ┆ YYADSVKGRFTISRDNSKNTLYLQMNSLRA… ┆ EIVLTQSPATLSLSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ NRATGIPARFSGSGSGTDFTLTISSLEPED… ┆ 920 │
# │ QVQLVESGGGVVQPGRSLRLSCAAS ┆ MHWVRQAPGKGLEWVAV ┆ YYADSVKGRFTISRDNSKNTLYLQMNSLRA… ┆ DIQMTQSPSTLSASVGDRVTITCRAS ┆ LAWYQQKPGKAPKLLIY ┆ SLESGVPSRFSGSGSGTEFTLTISSLQPDD… ┆ 900 │
# │ QVQLVQSGAEVKKPGSSVKVSCKAS ┆ ISWVRQAPGQGLEWMGG ┆ NYAQKFQGRVTITADESTSTAYMELSSLRS… ┆ EIVLTQSPGTLSLSPGERATLSCRAS ┆ LAWYQQKPGQAPRLLIY ┆ SRATGIPDRFSGSGSGTDFTLTISRLEPED… ┆ 899 │
# │ EVQLVESGGGLVQPGGSLRLSCAAS ┆ MSWVRQAPGKGLEWVAN ┆ YYVDSVKGRFTISRDNAKNSLYLQMNSLRA… ┆ DIQMTQSPSSLSASVGDRVTITCRAS ┆ LNWYQQKPGKAPKLLIY ┆ SLQSGVPSRFSGSGSGTDFTLTISSLQPED… ┆ 868 │
# └───────────────────────────┴───────────────────┴─────────────────────────────────┴────────────────────────────┴───────────────────┴─────────────────────────────────┴───────┘
Polars integrations
The polars ecosystem has many handy tools for data engineering -- for instance, you can check out the awesome-polars list. Here we provide examples of two packages that we use actively: polars-distance, providing a convenient (and fast!) interface for string distance computations, and polars-bio, providing a direct interface for reading biological data formats (fasta/fastq/...) into polars dataframes.
Example: annotating on-the-fly with polars-bio
polars-bio can scan biological sequence files
(FASTA, FASTQ) directly as a LazyFrame. Chain that with imp.segment to annotate without
an intermediate file:
import pytest
import polars as pl

# importorskip takes the *import* name, which uses an underscore: the PyPI
# distribution is "polars-bio" but the module is imported as "polars_bio".
# (A hyphen is not valid in a Python module name, so "polars-bio" would
# never import and the example would always be skipped.)
pb = pytest.importorskip("polars_bio")
import immunum.polars as imp
def annotate_fasta(path: str) -> pl.DataFrame:
    """Segment every record of a FASTA file scanned lazily via polars-bio.

    The file never hits an intermediate format: pb.scan_fasta yields a
    LazyFrame that imp.segment annotates before streaming collection.
    """
    annotations = imp.segment(
        "sequence", chains=["IGH", "IGK", "IGL"], scheme="IMGT"
    ).struct.unnest()
    lazy = pb.scan_fasta(path).select(pl.col("name"), annotations)
    return lazy.collect(engine="streaming")
# Skipped under pytest: the example needs polars_bio installed and a FASTA file.
pytest.skip("requires polars_bio and a FASTA file")
print(annotate_fasta("sequences.fasta"))
Example: computing minimal distance between CDR3s in two datasets
Cross-join two CDR3 sets and keep the nearest neighbour by Levenshtein distance. Pre-filter to unique CDR3s and, if the datasets are large, narrow the search to equal-length sequences first to reduce the cross-product size.
import pytest
import polars as pl
pld = pytest.importorskip("polars_distance")
import immunum.polars as imp
def nearest_cdr3s(source: str, target: str) -> pl.DataFrame:
    """For each unique source CDR3, find the Levenshtein-nearest target CDR3.

    Returns the 20 source CDR3s with the smallest minimal distance, each with
    one nearest target CDR3 (the first after sorting candidates by distance).
    """

    def _extract(path: str, alias: str) -> pl.DataFrame:
        # Pull only the cdr3 struct field, drop nulls, and deduplicate so the
        # cross join below stays as small as possible.
        cdr3 = (
            imp.segment("sequence_aa", chains=["IGH"], scheme="IMGT")
            .struct.field("cdr3")
            .alias(alias)
        )
        return (
            pl.scan_parquet(path)
            .select(cdr3)
            .filter(pl.col(alias).is_not_null())
            .unique(alias)
            .collect()
        )

    pairs = _extract(source, "cdr3_a").join(_extract(target, "cdr3_b"), how="cross")
    distances = pairs.select(
        pl.col("cdr3_a"),
        pl.col("cdr3_b"),
        pld.col("cdr3_a").dist_str.levenshtein("cdr3_b").alias("dist"),
    )
    return (
        distances.group_by("cdr3_a")
        .agg(
            pl.col("dist").min().alias("min_dist"),
            pl.col("cdr3_b").sort_by("dist").first().alias("nearest_cdr3"),
        )
        .sort("min_dist")
        .head(20)
    )
# Skipped under pytest: the example needs polars_distance and parquet data on disk.
pytest.skip("requires polars_distance and parquet data files")
print(nearest_cdr3s("source.parquet", "target.parquet"))
# shape: (20, 3)
# ┌───────────────────┬──────────┬───────────────────┐
# │ cdr3_a ┆ min_dist ┆ nearest_cdr3 │
# │ --- ┆ --- ┆ --- │
# │ str ┆ u32 ┆ str │
# ╞═══════════════════╪══════════╪═══════════════════╡
# │ ARDRDDPMADYHPLFDS ┆ 0 ┆ ARDRDDPMADYHPLFDS │
# │ ARESPPRLGHWYFDL ┆ 0 ┆ ARESPPRLGHWYFDL │
# │ ARESGRGVVSPYFDP ┆ 0 ┆ ARESGRGVVSPYFDP │
# │ ARHRGSTINIPYFDY ┆ 0 ┆ ARHRGSTINIPYFDY │
# │ ARDGGYSGSPWYYFDY ┆ 0 ┆ ARDGGYSGSPWYYFDY │
# │ … ┆ … ┆ … │
# │ AKERGSTGSAINY ┆ 0 ┆ AKERGSTGSAINY │
# │ AATTRDWFDP ┆ 0 ┆ AATTRDWFDP │
# │ ARDPDTSNKIDY ┆ 0 ┆ ARDPDTSNKIDY │
# │ ARDRSSDY ┆ 0 ┆ ARDRSSDY │
# │ ATHWDWRFDN ┆ 0 ┆ ATHWDWRFDN │
# └───────────────────┴──────────┴───────────────────┘