Amplicon BIOM#

This document contains functions for the various kinds of amplicon analysis used in UNLOCK: retrieving ASV, taxonomy, sequence and metadata tables from iRODS and converting them into a BIOM file.

Connection settings#

from irods.session import iRODSSession
import ssl
from irods.meta import iRODSMeta
import os, sys

# iRODS authentication information
host = "unlock-icat.irods.surfsara.nl"
port = 1247
zone = "unlock"
user = os.getenv('irodsUserName')
password = os.getenv('irodsPassword')

# SSL settings
context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)

ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',
                'irods_client_server_policy': 'CS_NEG_REQUIRE',
                'irods_encryption_algorithm': 'AES-256-CBC',
                'irods_encryption_key_size': 32,
                'irods_encryption_num_hash_rounds': 16,
                'irods_encryption_salt_size': 8,
                "irods_authentication_scheme": "pam",
                'ssl_context': context}
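
The settings above can be verified with a short connection test. This is a minimal sketch: it assumes the irodsUserName and irodsPassword environment variables are set and that you have read access to a collection; the /unlock/home path below is only an illustration.

# Minimal connection check (sketch), reusing the settings defined above
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # List the sub-collections of a collection you have access to
    # (the path is illustrative; adjust it to your own project)
    home = session.collections.get("/unlock/home")
    for sub in home.subcollections:
        print(sub.path)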
                

Amplicon#

Amplicon data retrieval#

Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file.

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '/unlock/home/wur.unlock.hivghana_drp0070%_PHYLOSEQ'))
    
    data_files = set()
    for index, r in enumerate(results):
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        data_files.add(data_file)
    for index, file_path in enumerate(data_files):
        # Mirror the iRODS path as a local path relative to the current directory
        folder_path = os.path.dirname(file_path)
        if not os.path.exists("." + file_path):
            os.makedirs("." + folder_path, exist_ok=True)
            print("Downloading: " + str(index) + " of " + str(len(data_files)) + " " + file_path, end="\r")
            session.data_objects.get(file_path, "." + file_path)
        else:
            print("Already received: " + file_path + " "*10, end="\r")
Already received: /unlock/home/wur.unlock.hivghana_drp0070/STU_DRP007092/OBS_XDRS176960/SAM_DRS176960/amplicon/ASY_DRR243924/NGTAX_Silva138.1-picrust_100_DRR243924/4_PHYLOSEQ/DRR243924_seq.tsv          
destinations = {
    "seq.tsv" : open("seq.tsv", "w"), 
    "asv.tsv" : open("asv.tsv", "w"),
    "tax.tsv" : open("tax.tsv", "w"),
    "met.tsv" : open("met.tsv", "w")
}

# Merge all tsv files into the appropriate file object
for index, data_file in enumerate(data_files):
    print("Processing ", index, end="\r")
    id = data_file.split("_")[-1]
    content = open("." + data_file).read()
    print(content, file=destinations[id])

# Close all destination files
for destination in destinations:
    destinations[destination].close()
Processing  35
# Process taxonomy
import pandas as pd

lines = []
for line in open("tax.tsv"):
    line = line.strip().split("\t")
    if len(line) < 8:
        line = line[:8] + ["NA"]*(8 - len(line))
    formatted = [line[0], "k__" + line[1]+ "; p__" + line[2]+ "; c__" + line[3]+ "; o__" + line[4]+ "; f__" + line[5]+ "; g__" + line[6]+ "; s__" + line[7]]
    lines.append(formatted)
tax_df = pd.DataFrame(lines)
tax_df.columns = ["ID", "taxonomy"]
tax_df = tax_df.set_index("ID")

print(tax_df.head())
                                                                                             taxonomy
ID                                                                                                   
                                                      k__NA; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA
01a0bec405b65f107a84eb1bdaf0ccbbcc177f487bab236...  k__Bacteria; p__Firmicutes; c__Bacilli; o__RF3...
01dc299b033cd3ddc735f033312a8ca6bfdee54fc12c2f9...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...
02db5a224a4f92228a2e78129f5ce999bde8f9d40fba844...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...
04a66337670a05caa7ff58d5d9516acac7646344dea7356...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...
# Load and transform the ASV file into a matrix
lines = []
size = {"nRow": set(), "nCol": set()}
# Read the asv.tsv file
for line in open("asv.tsv"):
    line = line.strip().split("\t")
    # Skip header
    if len(line) == 1: continue
    # Add to lines
    lines.append(line)
    # Row: ASV identifier
    size["nRow"].add(line[1])
    # Column: sample identifier
    size["nCol"].add(line[0])

# Convert the three-column (long) format into an ASV x sample matrix
asv_df = pd.DataFrame(index=range(len(size["nRow"])), columns=range(len(size["nCol"])))
asv_df.columns = sorted(size["nCol"])

# Sort the index
asv_index = sorted(list(size["nRow"]))
asv_df.index = asv_index

# Fill the matrix with the values from the lines
for index, line in enumerate(lines):
    if index % 10000 == 0:
        print(index, len(lines), end="\r")
    # Clean the sample, asv and value
    sample, asv, value = line
    sample = sample.strip()
    asv = asv.strip()
    value = value.strip()
    # Set the value to the matrix using the sample and asv as index
    asv_df.loc[asv, sample]=value
0 8800
# Merge the ASV and taxonomy dataframes
result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)
# Remove duplicates
result = result.drop_duplicates(keep="first")
# Fill NaN with 0
result.fillna(0, inplace=True)

# Rename the rownames
row_names = result.index.values
# Replace the rownames with ASV_1, ASV_2, ...
for index, value in enumerate(row_names):
    row_names[index] = "ASV_" + str(index + 1)
result.index = row_names

# Write to file
result.to_csv("merged.tsv", sep="\t")

# Load the merged table and fix the header lines
lines = open("merged.tsv").readlines()
# Add a comment header as the first line
lines.insert(0, "# Automatically generated input file for biom conversion")
# Prefix the column header line with the #OTU ID label
lines[1] = "#OTU ID" + lines[1]
# Write back to file and close it so the content is flushed before conversion
with open("merged.tsv", "w") as output:
    for line in lines:
        print(line.strip(), file=output)
# Load biom and json
from biom import load_table
import json

# Convert merged.tsv to biom file
biom_content = load_table("merged.tsv")

# Collect the sample identifiers present in the BIOM table
sample_ids = set(biom_content.ids(axis='sample'))
metadata = {}
keys = set()

# Read the met.tsv file
for index, line in enumerate(open("met.tsv")):
    line = line.strip()
    # Skip empty lines
    if "\t" not in line: continue
    identifier, key, value = line.split("\t")
    # Workaround for the biom reader: all metadata entries must have the same keys in the same order
    keys.add(key)
    # Skip lines that are not in this biom object
    if identifier not in sample_ids: continue
    if identifier not in metadata:
        metadata[identifier] = {}
    metadata[identifier][key] = str(value)

# Add missing keys to metadata using a sorted list
keys = sorted(keys)
# Add missing keys to metadata
for identifier in metadata:
    for key in keys:
        if key not in metadata[identifier]:
            metadata[identifier][key] = "None"

# Add metadata
biom_content.add_metadata(metadata)
# Set biom type
biom_content.type = "OTU table"
json_data = biom_content.to_json(generated_by="UNLOCK conversion module")

# Create Python object from JSON string data
obj = json.loads(json_data)

# Fix taxonomy split issue
for index, row in enumerate(obj["rows"]):
    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split("; ")

# Pretty Print JSON
json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)

# Write the BIOM (JSON) file
biom_file = "merged.biom"
print("Writing biom file to", biom_file)
with open(biom_file, "w") as biom_out:
    print(json_formatted_str, file=biom_out)
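
As a quick sanity check (a sketch, assuming the biom-format package imports correctly in your environment), the generated file can be read back and its dimensions inspected.

# Optional check: reload the generated BIOM file and report its dimensions
from biom import load_table

check = load_table(biom_file)
print(len(check.ids(axis='observation')), "ASVs x", len(check.ids(axis='sample')), "samples")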

Amplicon#

Picrust data retrieval#

from irods.models import Collection, DataObject
from irods.column import Criterion
import os

# Standard authentication
with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:
    # Set the filters to select the files needed for downloading
    results = session.query(Collection, DataObject).filter( \
        Criterion('like', Collection.name, '%STU_BO3B-BR1%')).filter( \
        Criterion('like', DataObject.name, '%_predicted.tsv.gz'))
    
    for r in results:
        # Download the result files
        data_file = r[Collection.name] + "/" + r[DataObject.name]
        # Mirror the iRODS path as a local path relative to the current directory
        if not os.path.exists("." + data_file):
            os.makedirs("." + r[Collection.name], exist_ok=True)
            print("Downloading: " + data_file, end="\r")
            session.data_objects.get(data_file, "." + data_file)
        else:
            print("Already received: " + data_file + " "*10, end="\r")