diff --git a/src/notebooks/R Examples/Search for Samples or Studies.ipynb b/src/notebooks/R Examples/Search for Samples or Studies.ipynb index be84212..8fe856f 100644 --- a/src/notebooks/R Examples/Search for Samples or Studies.ipynb +++ b/src/notebooks/R Examples/Search for Samples or Studies.ipynb @@ -4,12 +4,15 @@ "cell_type": "raw", "id": "c77864f8-222e-4907-94f7-dc6711e2f7a6", "metadata": { + "jupyter": { + "source_hidden": true + }, "tags": [] }, "source": [ "---\n", "title: \"Search for Samples or Studies\"\n", - "author: \"Sandy R (MGnify team)\"\n", + "author: \"Sandy R (MGnify team) and Ben Allen\"\n", "categories: [R]\n", "execute: \n", " enabled: true\n", @@ -51,7 +54,9 @@ "cell_type": "code", "execution_count": null, "id": "6cfcb483-889c-4d26-b2e2-c325f8a66283", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "library(IRdisplay)\n", @@ -70,7 +75,9 @@ "cell_type": "code", "execution_count": null, "id": "bb99acc5-6bfe-4d93-b258-6885505df8db", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "library(vegan)\n", @@ -121,18 +128,21 @@ "tags": [] }, "source": [ - "## Example: find Polar samples " + "## Example: find Polar samples \n", + "In these examples we set `maxhits=1` to retrieve only the first page of results. You can change the limit or set it to `-1` to retrieve all samples matching the query." 
] }, { "cell_type": "code", "execution_count": null, "id": "83e3d3f1-701b-4528-a6a6-44ac8637f385", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "samps_np <- mgnify_query(mg, \"samples\", latitude_gte=88, maxhits=-1)\n", - "samps_sp <- mgnify_query(mg, \"samples\", latitude_lte=-88, maxhits=-1)\n", + "samps_np <- mgnify_query(mg, \"samples\", latitude_gte=88, maxhits=1)\n", + "samps_sp <- mgnify_query(mg, \"samples\", latitude_lte=-88, maxhits=1)\n", "samps_polar <- rbind(samps_np, samps_sp)" ] }, @@ -140,7 +150,9 @@ "cell_type": "code", "execution_count": null, "id": "bbeaf601-0b08-4954-a169-41c2a5971f18", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "head(samps_polar)" @@ -158,17 +170,21 @@ "cell_type": "code", "execution_count": null, "id": "1587a68f-d855-4bb8-a540-829aebc80f44", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "studies_ww <- mgnify_query(mg, \"studies\", biome_name=\"wastewater\", maxhits=-1)" + "studies_ww <- mgnify_query(mg, \"studies\", biome_name=\"wastewater\", maxhits=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "d825d7a4-319c-419a-b5c0-9b2bab3dd87a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "head(studies_ww)" @@ -179,109 +195,45 @@ "id": "f6edc0c5-48a8-4b8d-a227-b82991007935", "metadata": {}, "source": [ - "## More Sample filters" - ] - }, - { - "cell_type": "markdown", - "id": "e975a310-262a-4870-b208-35b8c7868133", - "metadata": {}, - "source": [ - "### By location" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cea4da31-788d-4f1d-8f22-fa5bf131dd07", - "metadata": {}, - "outputs": [], - "source": [ - "more_northerly_than <- mgnify_query(mg, \"samples\", latitude_gte=88, maxhits=-1)\n", + "## More filters to try:\n", "\n", - "more_southerly_than <- mgnify_query(mg, \"samples\", latitude_lte=-88, maxhits=-1)\n", + "### Samples by location\n", "\n", - 
"more_easterly_than <- mgnify_query(mg, \"samples\", longitude_gte=170, maxhits=-1)\n", + "```R\n", + "more_northerly_than <- mgnify_query(mg, \"samples\", latitude_gte=88, maxhits=1)\n", "\n", - "more_westerly_than <- mgnify_query(mg, \"samples\", longitude_lte=170, maxhits=-1)\n", + "more_southerly_than <- mgnify_query(mg, \"samples\", latitude_lte=-88, maxhits=1)\n", "\n", - "at_location <- mgnify_query(mg, \"samples\", geo_loc_name=\"usa\", maxhits=-1)" - ] - }, - { - "cell_type": "markdown", - "id": "612eae01-4049-4f8e-90bc-866aedffd908", - "metadata": {}, - "source": [ - "### By biome" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97075a7b-3159-42c0-96c4-6a517370b1c4", - "metadata": {}, - "outputs": [], - "source": [ - "biome_within_wastewater <- mgnify_query(mg, \"samples\", biome_name=\"wastewater\", maxhits=-1)" - ] - }, - { - "cell_type": "markdown", - "id": "533541ce-b64e-43a5-984b-0e13970bc2cb", - "metadata": {}, - "source": [ - "### By metadata\n", + "more_easterly_than <- mgnify_query(mg, \"samples\", longitude_gte=170, maxhits=1)\n", + "\n", + "more_westerly_than <- mgnify_query(mg, \"samples\", longitude_lte=170, maxhits=1)\n", + "\n", + "at_location <- mgnify_query(mg, \"samples\", geo_loc_name=\"usa\", maxhits=1)\n", + "```\n", + "\n", + "### Samples by biome\n", + "```R\n", + "biome_within_wastewater <- mgnify_query(mg, \"samples\", biome_name=\"wastewater\", maxhits=1)\n", + "```\n", + "\n", + "### Samples by metadata\n", "There are a large number of metadata key:value pairs, because these are author-submitted, along with the samples, to the ENA archive.\n", "\n", - "If you know how to specify the metadata key:value query for the samples you're interested in, you can use this form to find matching Samples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b2a26cb-98ce-483c-88d1-c6b0fe149030", - "metadata": {}, - "outputs": [], - "source": [ - "from_ex_smokers <- mgnify_query(mg, \"samples\", 
metadata_key=\"smoker\", metadata_value=\"ex-smoker\", maxhits=-1)" - ] - }, - { - "cell_type": "markdown", - "id": "c554cea5-5ccd-44ed-bfef-a790eea6bfed", - "metadata": {}, - "source": [ + "If you know how to specify the metadata key:value query for the samples you're interested in, you can use this form to find matching Samples:\n", + "\n", + "```R\n", + "from_ex_smokers <- mgnify_query(mg, \"samples\", metadata_key=\"smoker\", metadata_value=\"ex-smoker\", maxhits=1)\n", + "```\n", + "To find `metadata_key`s and values, it is best to browse the [interactive API Browser](https://www.ebi.ac.uk/metagenomics/v1/samples), and use the `Filters` button to construct queries interactively at first.\n", + "\n", + "### Studies by centre name\n", + "```R\n", + "from_smithsonian <- mgnify_query(mg, \"studies\", centre_name=\"Smithsonian\", maxhits=1)\n", + "```\n", + "\n", "To find `metadata_key`s and values, it is best to browse the [interactive API Browser](https://www.ebi.ac.uk/metagenomics/v1/samples), and use the `Filters` button to construct queries interactively at first." 
] }, - { - "cell_type": "markdown", - "id": "6eb4e359-5905-4722-ab88-65a9b3e6e9c2", - "metadata": {}, - "source": [ - "--- \n", - "## More Study filters" - ] - }, - { - "cell_type": "markdown", - "id": "0d3ea380-2f3f-4fdd-97c3-eef9d24a3800", - "metadata": {}, - "source": [ - "### By Centre Name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c670e399-7297-44f3-951b-58040a9ac0d4", - "metadata": {}, - "outputs": [], - "source": [ - "from_smithsonian <- mgnify_query(mg, \"studies\", centre_name=\"Smithsonian\", maxhits=-1)" - ] - }, { "cell_type": "markdown", "id": "fe1f615b-68ec-4b60-bd87-74d3f6fe016b", @@ -289,6 +241,7 @@ "source": [ "---\n", "\n", + "\n", "## Example: adding additional filters to the data frame" ] }, @@ -304,7 +257,9 @@ "cell_type": "code", "execution_count": null, "id": "1dd0f6c8-0430-4ad2-adef-fb7c5c776fa5", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "lentic_samples <- mgnify_query(mg, \"samples\", biome_name=\"root:Environmental:Aquatic:Lentic\", usecache=T)" @@ -315,14 +270,16 @@ "id": "7d5c00be-be94-4368-9141-2ec66e8e16ce", "metadata": {}, "source": [ - "Not, also filter by depth *within* the returned results, using normal R syntax." + "Now, also filter by depth *within* the returned results, using normal R syntax." ] }, { "cell_type": "code", "execution_count": null, "id": "8e0a11f8-bb7c-4cd6-a314-9cb3ed7b4e0f", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "depth_numeric = as.numeric(lentic_samples$depth) # We must convert data from MGnifyR (always strings) to numerical format.\n",