non-10x input (just fasta) e.g. immunoseq

### Description of the issue

Feature request by @s.irac@garvan.org.au

`I would like to check if we could run the data output from the immunoseq ? I have a BCR dataset that I would like to analyse via your package and output different than 10x. I wonder if you have any script to import the data analysis via your package pipeline.`


---

Implementation will be something like:

```python
import os
import pandas as pd
import shutil

from pathlib import Path
from tqdm import tqdm
from typing import List, Optional

from dandelion.utilities._io import fasta_iterator, Write_output


def prepare_non10x_fasta(
    fasta: str,
    outdir: Optional[str] = None,
):
    """
    Prepare a non-10x fasta so that it can be ingested like 10x data for downstream analysis.

    Every record of the input is re-written to ``all_contig.fasta`` with the
    header renamed to ``<header>-1_contig-1``, and a placeholder
    ``all_contig_annotations.csv`` with one row per record is written next to it.
    Records that share a header collapse to the last one read.

    Parameters
    ----------
    fasta : str
        path to fasta file.
    outdir : Optional[str], optional
        path to output location. `None` defaults to a folder named after the
        fasta file (without its extension), located beside the fasta file.
    """
    with open(fasta, "r") as fh:
        seqs = {header: sequence for header, sequence in fasta_iterator(fh)}

    fasta_path = Path(fasta)
    if outdir is None:
        # Build the default with path joins rather than string concatenation:
        # for a bare file name the parent is "." and the output stays relative
        # to the working directory instead of resolving to "/<stem>".
        out_path = fasta_path.parent / fasta_path.stem
    else:
        out_path = Path(outdir)
    out_path.mkdir(parents=True, exist_ok=True)

    out_fasta = str(out_path / "all_contig.fasta")
    out_anno_path = str(out_path / "all_contig_annotations.csv")

    # Start from an empty fasta so that re-running does not add to old output.
    with open(out_fasta, "w"):
        pass

    anno = []
    for header, sequence in seqs.items():
        contig_id = header + "-1_contig-1"
        # NOTE(review): Write_output is presumably appending to the file (hence
        # the truncation above); kept per record because the helper is not
        # visible here - confirm before batching the writes.
        Write_output(">" + contig_id + "\n" + sequence + "\n", out_fasta)
        # also create a dummy contig_annotations.csv row; only the barcode,
        # contig_id and length are real, everything else is a placeholder.
        anno.append(
            {
                "barcode": header,
                "is_cell": "TRUE",
                "contig_id": contig_id,
                "high_confidence": "TRUE",
                "length": str(len(sequence)),
                "chain": "None",
                "v_gene": "None",
                "d_gene": "None",
                "j_gene": "None",
                "c_gene": "None",
                "full_length": "TRUE",
                "productive": "TRUE",
                "cdr3": "None",
                "cdr3_nt": "None",
                "reads": 1,
                "umis": 1,
                "raw_clonotype_id": "None",
                "raw_consensus_id": "None",
            }
        )
    pd.DataFrame(anno).to_csv(out_anno_path, index=False)


def prepare_non10x_fastas(
    fastas: List[str],
    outdir: Optional[str] = None,
):
    """
    Prepare one or more non-10x fastas so that they can be ingested like 10x data for downstream analysis.

    Each fasta is passed to `prepare_non10x_fasta`.

    Parameters
    ----------
    fastas : List[str]
        list of paths to fasta files. A single path is also accepted.
    outdir : Optional[str], optional
        path to output location. With `None`, the output location is chosen
        by `prepare_non10x_fasta` for each file. With a single fasta, the
        output is written directly into `outdir`. With several fastas, each
        one is written to its own sub-folder of `outdir`, named after the
        fasta file without its extension.
    """
    # A lone path (str or os.PathLike) is treated as a one-element list; any
    # other iterable (list, tuple, ...) is taken as the collection of paths.
    if isinstance(fastas, (str, os.PathLike)):
        fastas = [fastas]
    else:
        fastas = list(fastas)

    # Every file is written under the same fixed output names, so several
    # inputs sharing one folder would overwrite each other; give each input
    # its own sub-folder in that case.
    split_outdir = outdir is not None and len(fastas) > 1

    for fasta in tqdm(
        fastas,
        desc="Formating fasta(s) ",
        bar_format="{l_bar}{bar:10}{r_bar}{bar:-10b}",
    ):
        if split_outdir:
            target = os.path.join(outdir, Path(fasta).stem)
        else:
            target = outdir
        prepare_non10x_fasta(
            fasta,
            outdir=target,
        )
```

Usage would be:

```python
import os
import dandelion as ddl
files = [
    "/Users/kt16/Downloads/immunoseqtest/sample-1.fasta",
    "/Users/kt16/Downloads/immunoseqtest/sample-2.fasta",
]
# NOTE(review): called unqualified, i.e. as the function from the
# implementation sketch above; its final import path inside the package is
# not settled in this issue.
# With no `outdir`, each fasta gets a folder beside it named after the file
# (sample-1/, sample-2/) holding all_contig.fasta and
# all_contig_annotations.csv.
prepare_non10x_fastas(files)
# and then either just use the singularity image from here onwards

# singularity run sc-dandelion.sif dandelion-preprocess

# or just do it manually
# Change into the folder that contains the per-sample folders so the relative
# sample names below resolve.
os.chdir("/Users/kt16/Downloads/immunoseqtest/")
samples = ["sample-1", "sample-2"]
# "all" matches the `all_` prefix of the files written by prepare_non10x_fastas.
ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
ddl.pp.reannotate_genes(samples, filename_prefix="all") # etc



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

non-10x input (just fasta) e.g. immunoseq #232

Description of the issue

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

non-10x input (just fasta) e.g. immunoseq #232

Description

Description of the issue

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions