{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Amplicon BIOM\n",
    "\n",
    "This document contains many functions for all kinds of amplicon analysis used in unlock"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Connection settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from irods.session import iRODSSession\n",
    "import ssl\n",
    "from irods.meta import iRODSMeta\n",
    "import os, sys\n",
    "\n",
    "# iRODS authentication information\n",
    "host = \"unlock-icat.irods.surfsara.nl\"\n",
    "port = 1247\n",
    "zone = \"unlock\"\n",
    "user = os.getenv('irodsUserName')\n",
    "password = os.getenv('irodsPassword')\n",
    "\n",
    "# SSL settings\n",
    "context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=None, capath=None, cadata=None)\n",
    "\n",
    "ssl_settings = {'irods_client_server_negotiation': 'request_server_negotiation',\n",
    "                'irods_client_server_policy': 'CS_NEG_REQUIRE',\n",
    "                'irods_encryption_algorithm': 'AES-256-CBC',\n",
    "                'irods_encryption_key_size': 32,\n",
    "                'irods_encryption_num_hash_rounds': 16,\n",
    "                'irods_encryption_salt_size': 8,\n",
    "                \"irods_authentication_scheme\": \"pam\",\n",
    "                'ssl_context': context}\n",
    "                "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Amplicon\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Amplicon data retrieval\n",
    "Retrieval of ASV, taxonomy, sequence and metadata information and transformation into a BIOM file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Already received: /unlock/home/wur.unlock.hivghana_drp0070/STU_DRP007092/OBS_XDRS176960/SAM_DRS176960/amplicon/ASY_DRR243924/NGTAX_Silva138.1-picrust_100_DRR243924/4_PHYLOSEQ/DRR243924_seq.tsv          \r"
     ]
    }
   ],
   "source": [
    "from irods.models import Collection, DataObject\n",
    "from irods.column import Criterion\n",
    "import os\n",
    "\n",
    "# Standard authentication\n",
    "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n",
    "    # Set the filters to select the files needed for downloading\n",
    "    results = session.query(Collection, DataObject).filter( \\\n",
    "        Criterion('like', Collection.name, '/unlock/home/wur.unlock.hivghana_drp0070%_PHYLOSEQ'))\n",
    "    \n",
    "    data_files = set()\n",
    "    for index, r in enumerate(results):\n",
    "        # Download the result files\n",
    "        data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n",
    "        data_files.add(data_file)\n",
    "    for index, file_path in enumerate(data_files):\n",
    "        # Original path in irods is local path from current location\n",
    "        folder_path = os.path.dirname(file_path)\n",
    "        if not os.path.exists(\".\" + file_path):\n",
    "            os.makedirs(\".\" + folder_path, exist_ok=True)\n",
    "            print(\"Downloading: \" + str(index) + \" of \" + str(len(data_files)) + \" \" + file_path, end=\"\\r\")\n",
    "            session.data_objects.get(file_path, \".\" + file_path)\n",
    "        else:\n",
    "            print(\"Already received: \" + file_path + \" \"*10, end=\"\\r\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing  35\r"
     ]
    }
   ],
   "source": [
    "destinations = {\n",
    "    \"seq.tsv\" : open(\"seq.tsv\", \"w\"), \n",
    "    \"asv.tsv\" : open(\"asv.tsv\", \"w\"),\n",
    "    \"tax.tsv\" : open(\"tax.tsv\", \"w\"),\n",
    "    \"met.tsv\" : open(\"met.tsv\", \"w\")\n",
    "}\n",
    "\n",
    "# Merge all tsv files into the appropiate file object\n",
    "for index, data_file in enumerate(data_files):\n",
    "    print(\"Processing \", index, end=\"\\r\")\n",
    "    id = data_file.split(\"_\")[-1]\n",
    "    content = open(\".\" + data_file).read()\n",
    "    print(content, file=destinations[id])\n",
    "\n",
    "# Close all destination files\n",
    "for destination in destinations:\n",
    "    destinations[destination].close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                                                             taxonomy\n",
      "ID                                                                                                   \n",
      "                                                      k__NA; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA\n",
      "01a0bec405b65f107a84eb1bdaf0ccbbcc177f487bab236...  k__Bacteria; p__Firmicutes; c__Bacilli; o__RF3...\n",
      "01dc299b033cd3ddc735f033312a8ca6bfdee54fc12c2f9...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n",
      "02db5a224a4f92228a2e78129f5ce999bde8f9d40fba844...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n",
      "04a66337670a05caa7ff58d5d9516acac7646344dea7356...  k__Bacteria; p__Firmicutes; c__Clostridia; o__...\n"
     ]
    }
   ],
   "source": [
    "# Process taxonomy\n",
    "import pandas as pd\n",
    "\n",
    "lines = []\n",
    "for line in open(\"tax.tsv\"):\n",
    "    line = line.strip().split(\"\\t\")\n",
    "    if len(line) < 8:\n",
    "        line = line[:8] + [\"NA\"]*(8 - len(line))\n",
    "    formatted = [line[0], \"k__\" + line[1]+ \"; p__\" + line[2]+ \"; c__\" + line[3]+ \"; o__\" + line[4]+ \"; f__\" + line[5]+ \"; g__\" + line[6]+ \"; s__\" + line[7]]\n",
    "    lines.append(formatted)\n",
    "tax_df = pd.DataFrame(lines)\n",
    "tax_df.columns = [\"ID\", \"taxonomy\"]\n",
    "tax_df = tax_df.set_index(\"ID\")\n",
    "\n",
    "print(tax_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 8800\r"
     ]
    }
   ],
   "source": [
    "# Load and transform the ASV file into a matrix\n",
    "lines = []\n",
    "size = {\"nRow\": set(), \"nCol\": set()}\n",
    "# Read the asv.tsv file\n",
    "for line in open(\"asv.tsv\"):\n",
    "    line = line.strip().split(\"\\t\")\n",
    "    # Skip header\n",
    "    if len(line) == 1: continue\n",
    "    # Add to lines\n",
    "    lines.append(line)\n",
    "    # Row sample\n",
    "    size[\"nRow\"].add(line[1])\n",
    "    # Column ASV\n",
    "    size[\"nCol\"].add(line[0])\n",
    "\n",
    "# Three columns to matrix method    \n",
    "asv_df = pd.DataFrame(index=range(len(size[\"nRow\"])),columns=range(len(size[\"nCol\"])))\n",
    "asv_df.columns = size[\"nCol\"]\n",
    "\n",
    "# Sort the index\n",
    "asv_index = sorted(list(size[\"nRow\"]))\n",
    "asv_df.index = asv_index\n",
    "\n",
    "# Fill the matrix with the values from the lines\n",
    "for index, line in enumerate(lines):\n",
    "    if index % 10000 == 0:\n",
    "        print(index, len(lines), end=\"\\r\")\n",
    "    # Clean the sample, asv and value\n",
    "    sample, asv, value = line\n",
    "    sample = sample.strip()\n",
    "    asv = asv.strip()\n",
    "    value = value.strip()\n",
    "    # Set the value to the matrix using the sample and asv as index\n",
    "    asv_df.loc[asv, sample]=value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the ASV and taxonomy dataframes\n",
    "result = pd.merge(asv_df, tax_df, left_index=True, right_index=True)\n",
    "# Remove duplicates\n",
    "result = result.drop_duplicates(keep=\"first\")\n",
    "# Fill NaN with 0\n",
    "result.fillna(0, inplace=True)\n",
    "\n",
    "# Rename the rownames\n",
    "row_names = result.index.values\n",
    "# Replace the rownames with ASV_1, ASV_2, ...\n",
    "for index, value in enumerate(row_names):\n",
    "    row_names[index] = \"ASV_\" + str(index + 1)\n",
    "result.index = row_names\n",
    "\n",
    "# Write to file\n",
    "result.to_csv(\"merged.tsv\", sep=\"\\t\")\n",
    "\n",
    "# Load and fix first line\n",
    "lines = open(\"merged.tsv\").readlines()\n",
    "# Add header\n",
    "lines.insert(0, \"# Automatically generated input file for biom conversion\")\n",
    "# Add OTU ID header\n",
    "lines[1] = \"#OTU ID\" + lines[1]\n",
    "# Write to file\n",
    "output = open(\"merged.tsv\", \"w\")\n",
    "for line in lines:\n",
    "    print(line.strip(), file=output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[17], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m# Load biom and json\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m \u001b[39mimport\u001b[39;00m load_table\n\u001b[1;32m      3\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mjson\u001b[39;00m\n\u001b[1;32m      5\u001b[0m \u001b[39m# Convert merged.tsv to biom file\u001b[39;00m\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/__init__.py:51\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[39mQuick start\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[39m===========\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     41\u001b[0m \n\u001b[1;32m     42\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m     43\u001b[0m \u001b[39m# ----------------------------------------------------------------------------\u001b[39;00m\n\u001b[1;32m     44\u001b[0m \u001b[39m# Copyright (c) 2011-2020, The BIOM Format Development Team.\u001b[39;00m\n\u001b[1;32m     45\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     48\u001b[0m \u001b[39m# The full license is in the file COPYING.txt, distributed with this software.\u001b[39;00m\n\u001b[1;32m     49\u001b[0m \u001b[39m# ----------------------------------------------------------------------------\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mtable\u001b[39;00m \u001b[39mimport\u001b[39;00m Table\n\u001b[1;32m     52\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mparse\u001b[39;00m \u001b[39mimport\u001b[39;00m parse_biom_table \u001b[39mas\u001b[39;00m parse_table, load_table\n\u001b[1;32m     53\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mutil\u001b[39;00m \u001b[39mimport\u001b[39;00m __format_version__, __version__\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/table.py:195\u001b[0m\n\u001b[1;32m    190\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutil\u001b[39;00m \u001b[39mimport\u001b[39;00m (get_biom_format_version_string,\n\u001b[1;32m    191\u001b[0m                        get_biom_format_url_string, flatten, natsort,\n\u001b[1;32m    192\u001b[0m                        prefer_self, index_list, H5PY_VLEN_STR, HAVE_H5PY,\n\u001b[1;32m    193\u001b[0m                        __format_version__)\n\u001b[1;32m    194\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mbiom\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39merr\u001b[39;00m \u001b[39mimport\u001b[39;00m errcheck\n\u001b[0;32m--> 195\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_filter\u001b[39;00m \u001b[39mimport\u001b[39;00m _filter\n\u001b[1;32m    196\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_transform\u001b[39;00m \u001b[39mimport\u001b[39;00m _transform\n\u001b[1;32m    197\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_subsample\u001b[39;00m \u001b[39mimport\u001b[39;00m _subsample\n",
      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/biom/_filter.pyx:1\u001b[0m, in \u001b[0;36minit biom._filter\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject"
     ]
    }
   ],
   "source": [
    "# Load biom and json\n",
    "from biom import load_table\n",
    "import json\n",
    "\n",
    "# Convert merged.tsv to biom file\n",
    "biom_content = load_table(\"merged.tsv\")\n",
    "\n",
    "# Load metadata file\n",
    "sample_ids = set(biom_content._sample_ids)\n",
    "metadata = {}\n",
    "keys = set()\n",
    "\n",
    "# Read the met.tsv file\n",
    "for index, line in enumerate(open(\"met.tsv\")):\n",
    "    line = line.strip()\n",
    "    # Skip empty lines\n",
    "    if \"\\t\" not in line: continue\n",
    "    identifier, key, value = line.split(\"\\t\")\n",
    "    # Bug in biom reader, all metadata need to have the same keys in the same order\n",
    "    keys.add(key)\n",
    "    # Skip lines that are not in this biom object\n",
    "    if identifier not in sample_ids: continue\n",
    "    if identifier not in metadata:\n",
    "        metadata[identifier] = {}\n",
    "    metadata[identifier][key] = str(value)\n",
    "\n",
    "# Add missing keys to metadata using a sorted list\n",
    "keys = sorted(keys)\n",
    "# Add missing keys to metadata\n",
    "for identifier in metadata:\n",
    "    for key in keys:\n",
    "        if key not in metadata[identifier]:\n",
    "            metadata[identifier][key] = \"None\"\n",
    "\n",
    "# Add metadata\n",
    "biom_content.add_metadata(metadata)\n",
    "# Set biom type\n",
    "biom_content.type = \"OTU table\"\n",
    "json_data = biom_content.to_json(generated_by=\"UNLOCK conversion module\")\n",
    "\n",
    "# Create Python object from JSON string data\n",
    "obj = json.loads(json_data)\n",
    "\n",
    "# Fix taxonomy split issue\n",
    "for index, row in enumerate(obj[\"rows\"]):\n",
    "    row['metadata']['taxonomy'] = row['metadata']['taxonomy'].split(\"; \")\n",
    "\n",
    "# Pretty Print JSON\n",
    "json_formatted_str = json.dumps(obj, indent=4, sort_keys=True)\n",
    "\n",
    "# Create biom file\n",
    "biom_file = \"merged.biom\"\n",
    "print(\"Writing biom file to\", biom_file)\n",
    "print(json_formatted_str, file=open(biom_file, \"w\"))\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Amplicon\n",
    "\n",
    "### Picrust data retrieval\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from irods.models import Collection, DataObject\n",
    "from irods.column import Criterion\n",
    "import os\n",
    "\n",
    "# Standard authentication\n",
    "with iRODSSession(host = host, port = port, user = user, password = password, zone = zone, **ssl_settings) as session:\n",
    "    # Set the filters to select the files needed for downloading\n",
    "    results = session.query(Collection, DataObject).filter( \\\n",
    "        Criterion('like', Collection.name, '%STU_BO3B-BR1%')).filter( \\\n",
    "        Criterion('like', DataObject.name, '%_predicted.tsv.gz'))\n",
    "    \n",
    "    for r in results:\n",
    "        # Download the result files\n",
    "        data_file = r[Collection.name] + \"/\" + r[DataObject.name]\n",
    "        # Original path in irods is local path from current location\n",
    "        if not os.path.exists(\".\" + data_file):\n",
    "            os.makedirs(\".\" + r[Collection.name], exist_ok=True)\n",
    "            print(\"Downloading: \" + data_file, end=\"\\r\")\n",
    "            session.data_objects.get(data_file, \".\" + data_file)\n",
    "        else:\n",
    "            print(\"Already received: \" + data_file + \" \"*10, end=\"\\r\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}