From 29ac2c6eaf782f2270c70576b2aa2ee2b8c4ff42 Mon Sep 17 00:00:00 2001 From: keelinm Date: Fri, 24 Nov 2023 20:08:04 +0100 Subject: [PATCH] various updates --- scripts/MergeChanges.ipynb | 137 ++++++++++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 32 deletions(-) diff --git a/scripts/MergeChanges.ipynb b/scripts/MergeChanges.ipynb index 71a82db..7501a12 100644 --- a/scripts/MergeChanges.ipynb +++ b/scripts/MergeChanges.ipynb @@ -2,10 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "d377a755", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\keeli\\anaconda3\\envs\\lit-env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import pandas as pd\n", "import os\n", @@ -15,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f6cf2984", "metadata": {}, "outputs": [], @@ -47,44 +56,54 @@ " return df_bib_data" ] }, + { + "cell_type": "markdown", + "id": "5d1be2b7", + "metadata": {}, + "source": [ + "# Read in the LIVE bib file and the updated bib file we want to merge it with" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5b5596e1", "metadata": {}, "outputs": [], "source": [ "path_diag_bib = os.path.join('..', 'diag.bib')\n", "diag_bib_raw = read_bibfile(None, path_diag_bib)\n", - "bibfile = from_bib_to_csv(diag_bib_raw)\n", - "path_new_bib = os.path.join('script_data/', 'diag_ss_new.bib')\n", + "# convert the live bib file to csv format\n", + "bibfile_as_csv = from_bib_to_csv(diag_bib_raw)\n", + "path_new_bib = os.path.join('script_data/', 'diag_ss_tmp_new.bib')\n", "updated_bib_raw = read_bibfile(None, path_new_bib)\n", - "newbibfile = from_bib_to_csv(updated_bib_raw)" + "# convert the updated bibfile to csv format\n", + "newbibfile_as_csv = from_bib_to_csv(updated_bib_raw)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "98489bba", "metadata": {}, "outputs": [], "source": [ - "new_bibkeys = newbibfile['bibkey'].tolist()" + "new_bibkeys = newbibfile_as_csv['bibkey'].tolist()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2374dcc9", "metadata": {}, "outputs": [], "source": [ - "old_bibkeys = bibfile['bibkey'].tolist()" + "old_bibkeys = bibfile_as_csv['bibkey'].tolist()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "f38e2c4c", "metadata": {}, "outputs": [], @@ -98,12 +117,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "366debbe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\"KM: I propose that the updated bib file is the one we will keep and we only need to check if it misses anything that is in the original\\nfor entry in diag_bib_raw:\\n if entry.type == 'string':\\n continue\\n bibkey = entry.key\\n if bibkey in new_bibkeys:\\n corresponding_entry = get_entry(updated_bib_raw, bibkey)\\n if 'all_ss_ids' in corresponding_entry.fields:\\n entry.fields['all_ss_ids'] = corresponding_entry.fields['all_ss_ids']\\n if 'pmid' in corresponding_entry.fields:\\n entry.fields['pmid'] = corresponding_entry.fields['pmid']\\n\"" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Update existing bibitems\n", + "\"\"\"KM: I propose that the updated bib file is the one we will keep and we only need to check if it misses anything that is in the original\n", "for entry in diag_bib_raw:\n", " if entry.type == 'string':\n", " continue\n", @@ -113,41 +144,75 @@ " if 'all_ss_ids' in corresponding_entry.fields:\n", " entry.fields['all_ss_ids'] = corresponding_entry.fields['all_ss_ids']\n", " if 'pmid' in corresponding_entry.fields:\n", - " entry.fields['pmid'] = corresponding_entry.fields['pmid']" + " entry.fields['pmid'] = corresponding_entry.fields['pmid']\n", + "\"\"\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, + "id": "550580b1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Brou23', 'Twil23c', 'Hump23', 'Anto23a', 'Aswo23', 'Hend23b'}\n" + ] + } + ], + "source": [ + "bibkeys_not_in_updated = set(old_bibkeys)-set(new_bibkeys)\n", + "print(bibkeys_not_in_updated)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "id": "21522e57", "metadata": {}, "outputs": [], "source": [ - "bibkeys_toadd = set(new_bibkeys)-set(old_bibkeys)\n", - "df_to_add = newbibfile[newbibfile['bibkey'].isin(bibkeys_toadd)]" + "# bibkeys_toadd = set(new_bibkeys)-set(old_bibkeys)\n", + "# df_to_add = newbibfile[newbibfile['bibkey'].isin(bibkeys_toadd)]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "3c023620", "metadata": {}, "outputs": [], "source": [ - "bibkeys_toadd" + "# bibkeys_toadd" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "82d6aeeb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "appending entry Twil23c\n", + "appending entry Hend23b\n", + "appending entry Anto23a\n", + "appending entry Brou23\n", + "appending entry Hump23\n", + "appending entry Aswo23\n" + ] + } + ], "source": [ - "for entry in updated_bib_raw:\n", + "for entry in diag_bib_raw:\n", " bibkey = entry.key\n", - " if bibkey in bibkeys_toadd:\n", - " diag_bib_raw.append(entry)" + " if bibkey in bibkeys_not_in_updated:\n", + " print('appending entry', bibkey)\n", + " updated_bib_raw.append(entry)" ] }, { @@ -157,26 +222,34 @@ "metadata": {}, "outputs": [], "source": [ - "csv=from_bib_to_csv(diag_bib_raw)" + "#csv=from_bib_to_csv(diag_bib_raw)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "cf941ddf", "metadata": {}, "outputs": [], "source": [ - "path_diag_bib = os.path.join('..', 'diag_latest_try.bib')\n", - "save_to_file(diag_bib_raw, path_diag_bib)" + "path_diag_bib = os.path.join('..', 'diag_orig_and_ss_merged.bib')\n", + "save_to_file(updated_bib_raw, path_diag_bib)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d491d30", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "lit-env", "language": "python", - "name": "python3" + "name": "lit-env" }, "language_info": { "codemirror_mode": { @@ -188,7 +261,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.18" } }, "nbformat": 4,