{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Amplicon BIOM\n", "\n", "This document contains many functions for all kinds of amplicon analysis used in unlock" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Connection settings" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from irods.session import iRODSSession\n", "import ssl\n", "from irods.meta import iRODSMeta\n", "import os, sys\n", "\n", "# iRODS authentication information\n", "host = \"unlock-icat.irods.surfsara.nl\"\n", "port = 1247\n", "zone = \"unlock\"\n", "user = os.getenv('irodsUserName')\n", "password = os.getenv('irodsPassword')\n", "\n", "# SSL settings\n", "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n", "\n", "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n", " 'irods_client_server_policy': 'CS_NEG_REQUIRE',\n", " 'irods_encryption_algorithm': 'AES-256-CBC',\n", " 'irods_encryption_key_size': 32,\n", " 'irods_encryption_num_hash_rounds': 16,\n", " 'irods_encryption_salt_size': 8,\n", " \"irods_authentication_scheme\": \"pam\",\n", " 'ssl_context': context}\n", " " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Amplicon\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Amplicon data retrieval\n", "Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Already received: /unlock/home/wur.unlock.hivghana_drp0070/STU_DRP007092/OBS_XDRS176960/SAM_DRS176960/amplicon/ASY_DRR243924/NGTAX_Silva138.1-picrust_100_DRR243924/4_PHYLOSEQ/DRR243924_seq.tsv \r" ] } ], "source": [ "from irods.models import Collection, DataObject\n", "from irods.column import Criterion\n", "import os\n", "\n", "# Standard authentication\n", "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n", " # Set the filters to select the files needed for downloading\n", " results = session.query(Collection, DataObject).filter( \\\n", " Criterion('like', Collection.name, '/unlock/home/wur.unlock.hivghana_drp0070%_PHYLOSEQ'))\n", " \n", " data_files = set()\n", " for index, r in enumerate(results):\n", " # Download the result files\n", " data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n", " data_files.add(data_file)\n", " for index, file_path in enumerate(data_files):\n", " # Original path in irods is local path from current location\n", " folder_path = os.path.dirname(file_path)\n", " if not os.path.exists(\".\" + file_path):\n", " os.makedirs(\".\" + folder_path, exist_ok=True)\n", " print(\"Downloading: \" + str(index) + \" of \" + str(len(data_files)) + \" \" + file_path, end=\"\\r\")\n", " session.data_objects.get(file_path, \".\" + file_path)\n", " else:\n", " print(\"Already received: \" + file_path + \" \"*10, end=\"\\r\")\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing 35\r" ] } ], "source": [ "destinations = {\n", " \"seq.tsv\" : open(\"seq.tsv\", \"w\"), \n", " \"asv.tsv\" : open(\"asv.tsv\", \"w\"),\n", " \"tax.tsv\" : open(\"tax.tsv\", \"w\"),\n", " \"met.tsv\" : open(\"met.tsv\", \"w\")\n", "}\n", "\n", "# Merge all tsv files into the appropiate file object\n", "for index, data_file in enumerate(data_files):\n", " print(\"Processing \", index, end=\"\\r\")\n", " id = data_file.split(\"_\")[-1]\n", " content = open(\".\" + data_file).read()\n", " print(content, file=destinations[id])\n", "\n", "# Close all destination files\n", "for destination in destinations:\n", " destinations[destination].close()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " taxonomy\n", "ID \n", " k__NA; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA\n", "01a0bec405b65f107a84eb1bdaf0ccbbcc177f487bab236... k__Bacteria; p__Firmicutes; c__Bacilli; o__RF3...\n", "01dc299b033cd3ddc735f033312a8ca6bfdee54fc12c2f9... k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n", "02db5a224a4f92228a2e78129f5ce999bde8f9d40fba844... k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n", "04a66337670a05caa7ff58d5d9516acac7646344dea7356... k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n" ] } ], "source": [ "# Process taxonomy\n", "import pandas as pd\n", "\n", "lines = []\n", "for line in open(\"tax.tsv\"):\n", " line = line.strip().split(\"\\t\")\n", " if len(line) < 8:\n", " line = line[:8] + [\"NA\"]*(8 - len(line))\n", " formatted = [line[0], \"k__\" + line[1]+ \"; p__\" + line[2]+ \"; c__\" + line[3]+ \"; o__\" + line[4]+ \"; f__\" + line[5]+ \"; g__\" + line[6]+ \"; s__\" + line[7]]\n", " lines.append(formatted)\n", "tax_df = pd.DataFrame(lines)\n", "tax_df.columns = [\"ID\", \"taxonomy\"]\n", "tax_df = tax_df.set_index(\"ID\")\n", "\n", "print(tax_df.head())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 8800\r" ] } ], "source": [ "# Load and transform the ASV file into a matrix\n", "lines = []\n", "size = {\"nRow\": set(), \"nCol\": set()}\n", "# Read the asv.tsv file\n", "for line in open(\"asv.tsv\"):\n", " line = line.strip().split(\"\\t\")\n", " # Skip header\n", " if len(line) == 1: continue\n", " # Add to lines\n", " lines.append(line)\n", " # Row sample\n", " size[\"nRow\"].add(line[1])\n", " # Column ASV\n", " size[\"nCol\"].add(line[0])\n", "\n", "# Three columns to matrix method \n", "asv_df = pd.DataFrame(index=range(len(size[\"nRow\"])),columns=range(len(size[\"nCol\"])))\n", "asv_df.columns = size[\"nCol\"]\n", "\n", "# Sort the index\n", "asv_index = sorted(list(size[\"nRow\"]))\n", "asv_df.index = asv_index\n", "\n", "# Fill the matrix with the values from the lines\n", "for index, line in enumerate(lines):\n", " if index % 10000 == 0:\n", " print(index, len(lines), end=\"\\r\")\n", " # Clean the sample, asv and value\n", " sample, asv, value = line\n", " sample = sample.strip()\n", " asv = asv.strip()\n", " value = value.strip()\n", " # Set the value to the matrix using the sample and asv as index\n", " asv_df.loc[asv, sample]=value" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Merge the ASV and taxonomy dataframes\n", "result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)\n", "# Remove duplicates\n", "result = result.drop_duplicates(keep=\"first\")\n", "# Fill NaN with 0\n", "result.fillna(0, inplace=True)\n", "\n", "# Rename the rownames\n", "row_names = result.index.values\n", "# Replace the rownames with ASV_1, ASV_2, ...\n", "for index, value in enumerate(row_names):\n", " row_names[index] = \"ASV_\" + str(index + 1)\n", "result.index = row_names\n", "\n", "# Write to file\n", "result.to_csv(\"merged.tsv\", sep=\"\\t\")\n", "\n", "# Load and fix first line\n", "lines = open(\"merged.tsv\").readlines()\n", "# Add header\n", "lines.insert(0, \"# Automatically generated input file for biom conversion\")\n", "# Add OTU ID header\n", "lines[1] = \"#OTU ID\" + lines[1]\n", "# Write to file\n", "output = open(\"merged.tsv\", \"w\")\n", "for line in lines:\n", " print(line.strip(), file=output)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[17], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# Load biom and json\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m \u001b[39mimport\u001b[39;00m load_table\n\u001b[1;32m 3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mjson\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[39m# Convert merged.tsv to biom file\u001b[39;00m\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/__init__.py:51\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[39mQuick start\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[39m===========\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 41\u001b[0m \n\u001b[1;32m 42\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[39m# ----------------------------------------------------------------------------\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[39m# Copyright (c) 2011-2020, The BIOM Format Development Team.\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[39m# The full license is in the file COPYING.txt, distributed with this software.\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[39m# ----------------------------------------------------------------------------\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mtable\u001b[39;00m \u001b[39mimport\u001b[39;00m Table\n\u001b[1;32m 52\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mparse\u001b[39;00m \u001b[39mimport\u001b[39;00m parse_biom_table \u001b[39mas\u001b[39;00m parse_table, load_table\n\u001b[1;32m 53\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mutil\u001b[39;00m \u001b[39mimport\u001b[39;00m __format_version__, __version__\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/table.py:195\u001b[0m\n\u001b[1;32m 190\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutil\u001b[39;00m \u001b[39mimport\u001b[39;00m (get_biom_format_version_string,\n\u001b[1;32m 191\u001b[0m get_biom_format_url_string, flatten, natsort,\n\u001b[1;32m 192\u001b[0m prefer_self, index_list, H5PY_VLEN_STR, HAVE_H5PY,\n\u001b[1;32m 193\u001b[0m __format_version__)\n\u001b[1;32m 194\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39merr\u001b[39;00m \u001b[39mimport\u001b[39;00m errcheck\n\u001b[0;32m--> 195\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_filter\u001b[39;00m \u001b[39mimport\u001b[39;00m _filter\n\u001b[1;32m 196\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_transform\u001b[39;00m \u001b[39mimport\u001b[39;00m _transform\n\u001b[1;32m 197\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_subsample\u001b[39;00m \u001b[39mimport\u001b[39;00m _subsample\n", "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/_filter.pyx:1\u001b[0m, in \u001b[0;36minit biom._filter\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject" ] } ], "source": [ "# Load biom and json\n", "from biom import load_table\n", "import json\n", "\n", "# Convert merged.tsv to biom file\n", "biom_content = load_table(\"merged.tsv\")\n", "\n", "# Load metadata file\n", "sample_ids = set(biom_content._sample_ids)\n", "metadata = {}\n", "keys = set()\n", "\n", "# Read the met.tsv file\n", "for index, line in enumerate(open(\"met.tsv\")):\n", " line = line.strip()\n", " # Skip empty lines\n", " if \"\\t\" not in line: continue\n", " identifier, key, value = line.split(\"\\t\")\n", " # Bug in biom reader, all metadata need to have the same keys in the same order\n", " keys.add(key)\n", " # Skip lines that are not in this biom object\n", " if identifier not in sample_ids: continue\n", " if identifier not in metadata:\n", " metadata[identifier] = {}\n", " metadata[identifier][key] = str(value)\n", "\n", "# Add missing keys to metadata using a sorted list\n", "keys = sorted(keys)\n", "# Add missing keys to metadata\n", "for identifier in metadata:\n", " for key in keys:\n", " if key not in metadata[identifier]:\n", " metadata[identifier][key] = \"None\"\n", "\n", "# Add metadata\n", "biom_content.add_metadata(metadata)\n", "# Set biom type\n", "biom_content.type = \"OTU table\"\n", "json_data = biom_content.to_json(generated_by=\"UNLOCK conversion module\")\n", "\n", "# Create Python object from JSON string data\n", "obj = json.loads(json_data)\n", "\n", "# Fix taxonomy split issue\n", "for index, row in enumerate(obj[\"rows\"]):\n", " row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split(\"; \")\n", "\n", "# Pretty Print JSON\n", "json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)\n", "\n", "# Create biom file\n", "biom_file = \"merged.biom\"\n", "print(\"Writing biom file to\", biom_file)\n", "print(json_formatted_str, file=open(biom_file, \"w\"))\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Amplicon\n", "\n", "### Picrust data retrieval\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from irods.models import Collection, DataObject\n", "from irods.column import Criterion\n", "import os\n", "\n", "# Standard authentication\n", "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n", " # Set the filters to select the files needed for downloading\n", " results = session.query(Collection, DataObject).filter( \\\n", " Criterion('like', Collection.name, '%STU_BO3B-BR1%')).filter( \\\n", " Criterion('like', DataObject.name, '%_predicted.tsv.gz'))\n", " \n", " for r in results:\n", " # Download the result files\n", " data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n", " # Original path in irods is local path from current location\n", " if not os.path.exists(\".\" + data_file):\n", " os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n", " print(\"Downloading: \" + data_file, end=\"\\r\")\n", " session.data_objects.get(data_file, \".\" + data_file)\n", " else:\n", " print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }