From 8749b41af13f3d719016b80ef2a44b9215c3621d Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:28:22 -0700
Subject: [PATCH 1/9] Added the Colab

---
 .../Ego4D_AudioVisual.ipynb | 537 ++++++++++++++++++
 1 file changed, 537 insertions(+)
 create mode 100644 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
new file mode 100644
index 00000000..4f40a145
--- /dev/null
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -0,0 +1,537 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ad3Kas-mn3Ke"
+ },
+ "source": [
+ "**EGO4D Audio-Visual Diarization Benchmark**\n",
+ "- This notebook allows a quickstart into the EGO4D Audio Visual Repo\n",
+ "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
+ "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
+ "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab, the changes are commented out and are reversible"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Install Dependencies"
+ ],
+ "metadata": {
+ "id": "hIBjA7mUpK4F"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lhUO_Ov678bm"
+ },
+ "outputs": [],
+ "source": [
+ "!apt install ffmpeg python3-pip git\n",
+ "!pip install ego4d awscli numpy opencv-python pyqt5 opencv-contrib-python torch torchvision torchaudio\n",
+ "!sudo apt-get install libavcodec-dev libavformat-dev libswscale-dev libv4l-dev\n",
+ "!sudo apt-get install libxvidcore-dev libx264-dev\n",
+ "!sudo apt install libgtk2.0-dev liblcm-dev\n",
+ "!sudo apt-get install liblcm-dev\n",
+ "!pip install pydub audiosegment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qssel_-R8d-d"
+ },
+ "outputs": [],
+ "source": [
+ "# Install libtorch\n",
+ "%cd /content/egocentric\n",
+ "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
+ "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
+ "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Clone Repository & download the videos"
+ ],
+ "metadata": {
+ "id": "fbt09eafpoET"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "o77Hf1uJ8Fei"
+ },
+ "outputs": [],
+ "source": [
+ "# Create the new egocentric directory\n",
+ "!mkdir egocentric\n",
+ "%cd /content/egocentric\n",
+ "\n",
+ "# Clone the audio-visual repo\n",
+ "!git clone https://github.com/ashneet1/audio-visual.git\n",
+ "%cd /content/egocentric/audio-visual\n",
+ "!mkdir data\n",
+ "\n",
+ "# List the video uids to download\n",
+ "!touch video_uids.txt\n",
+ "!echo \"0b4cacb1-970f-4ef0-85da-371d81f899e0\" >> video_uids.txt\n",
+ "!echo \"c2413391-7c1b-4fd6-8b1d-98ee7888b9f8\" >> video_uids.txt\n",
+ "!echo \"fe69a78e-7773-45d1-9e0f-bacee52dac83\" >> video_uids.txt\n",
+ "!echo \"3b79017c-4d42-40fc-a1bb-4a20bc8ebca7\" >> video_uids.txt\n",
+ "!echo \"6dbfc053-7899-40d8-9827-0ccd21f3ee0a\" >> video_uids.txt\n",
+ "!echo \"7e6dfd31-8544-4fad-9e49-0f05516cf8cf\" >> video_uids.txt\n",
+ "!echo \"56c5af79-f9d4-478d-96ef-6d71e0bbbdfe\" >> video_uids.txt\n",
+ "!echo \"d97bedc8-72df-43be-a55b-4da1ae42dfd1\" >> video_uids.txt\n",
+ "!echo \"f0cb79ef-c081-4049-85ef-2623e02c9589\" >> 
video_uids.txt\n", + "!echo \"08b0935e-6260-4bd6-86ca-f6fc54e388be\" >> video_uids.txt\n", + "!echo \"6b34c327-000c-42b6-b242-d3dca63a7508\" >> video_uids.txt\n", + "!echo \"076bdb81-5c75-4282-9f3a-a387624575f3\" >> video_uids.txt\n", + "\n", + "# Configure aws cli to be able to access the ego4d dataset\n", + "!aws configure\n", + "\n", + "# Download ego4d model, annotation, and videos\n", + "!ego4d -y --output_directory ./data --datasets av_models clips annotations --benchmarks av --video_uid_file video_uids.txt\n", + "!tar xf data/v2/av_models/pretrained_av_models.tar.gz\n", + "!mv data/v2/annotations/* utils/ground_truth" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#EGO4D Audio-Visual Diarization Baseline" + ], + "metadata": { + "id": "Mlbjm9kPvqLr" + } + }, + { + "cell_type": "markdown", + "source": [ + "##Preprocess ground truth data" + ], + "metadata": { + "id": "d4opZblNpyHC" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IM6020Xg8aSF" + }, + "outputs": [], + "source": [ + "# Preprocess ground truth data\n", + "%cd /content/egocentric/audio-visual/utils/ground_truth\n", + "!bash init_dirs.sh\n", + "!python3 extract_clipnames_and_split_indices.py\n", + "!python3 extract_boxes_and_speakers.py\n", + "!python3 make_mot_ground_truth.py ../../data/v2/clips val\n", + "!mv tracking_evaluation/mot_challenge ../../tracking/tracking_evaluation/data/gt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jt7yzV648gOr", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "acf51bc2-fb54-4769-c20a-5e7d1b53d25f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/egocentric/audio-visual/utils/ground_truth\n" + ] + } + ], + "source": [ + "#Run visualize_ground_truth.py (It downloads the output video to the current directory)\n", + "%cd /content/egocentric/audio-visual/utils/ground_truth/\n", + "!python3 visualize_ground_truth.py /content/egocentric/audio-visual/data/v2/clips 0b4cacb1-970f-4ef0-85da-371d81f899e0 #This is 389" + ] + }, + { + "cell_type": "markdown", + "source": [ + "##Localization & Tracking" + ], + "metadata": { + "id": "c2fNByhuqTHn" + } + }, + { + "cell_type": "markdown", + "source": [ + "###People Detection Setup" + ], + "metadata": { + "id": "ZPQT5dMUqZjo" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZqOp5PWA8nNR" + }, + "outputs": [], + "source": [ + "# People detection setup\n", + "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#people-detection\n", + "%cd /content/egocentric/audio-visual/tracking/people_detection\n", + "\n", + "# Replace the lines in the makefile to use opencv4 instead of opencv3\n", + "!sed -i '44s/.*/LDFLAGS+= `pkg-config --libs opencv4` -lstdc++/' Makefile\n", + "!sed -i '45s/.*/COMMON+= `pkg-config --cflags opencv4`/' Makefile\n", + "\n", + "# Add missing headers required to build using opencv4\n", + "# https://stackoverflow.com/questions/64885148/error-iplimage-does-not-name-a-type-when-trying-to-build-darknet-with-opencv\n", + "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.cpp\n", + "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' src/image_opencv.cpp\n", + "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.cpp\n", + "\n", + "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.hpp\n", + "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' 
src/image_opencv.hpp\n",
+ "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.hpp\n",
+ "\n",
+ "!make -j"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "###Short Term Tracking"
+ ],
+ "metadata": {
+ "id": "vqidCROwqfJA"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2gpLUYd68vE2"
+ },
+ "outputs": [],
+ "source": [
+ "# Short term tracking setup\n",
+ "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#short_term_tracking\n",
+ "%cd /content/egocentric/audio-visual/tracking/short_term_tracking\n",
+ "\n",
+ "# Modify line 13 in the CMake file to include the opencv4 directory\n",
+ "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n",
+ "!sed -i '13s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "\n",
+ "# Modify line 17 in the CMake file to fix a compilation error\n",
+ "# https://github.com/pytorch/pytorch/issues/103371\n",
+ "!sed -i '17s,.*,set_property(TARGET short_term_tracker PROPERTY CXX_STANDARD 17),' CMakeLists.txt\n",
+ "!mkdir build\n",
+ "%cd build\n",
+ "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch ..\n",
+ "!make"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "###Run Global People Tracking"
+ ],
+ "metadata": {
+ "id": "jIfxzYDgqlkf"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "D-a-ez1A85EB"
+ },
+ "outputs": [],
+ "source": [
+ "#Global People Tracking\n",
+ "%cd /content/egocentric/audio-visual/tracking\n",
+ "!python3 single_run.py /content/egocentric/audio-visual/data/v2/clips 438"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Voice Activity Detection (VAD)"
+ ],
+ "metadata": {
+ "id": "uG8NV_oUrDZY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "SxlxvS0Y81BU"
+ },
+ "outputs": [],
+ "source": [
+ "#Voice Activity Detection\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/vad\n",
+ "!python3 extract_all_audio.py /content/egocentric/audio-visual/data/v2/clips\n",
+ "!python3 vad.py"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Active Speaker Detection (ASD)"
+ ],
+ "metadata": {
+ "id": "rVr2OzKIrRBY"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "####Mouth Region Classification (MRC)"
+ ],
+ "metadata": {
+ "id": "2BN1eTm_rePW"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ipOe-4y1883Q"
+ },
+ "outputs": [],
+ "source": [
+ "# Active Speaker Detection (ASD)\n",
+ "# Mouth region classification (MRC)\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n",
+ "\n",
+ "# Modify lines 15 and 17 in the CMake file to include the opencv4 directory\n",
+ "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n",
+ "!sed -i '15s,.*,link_directories( /usr/local/lib /usr/local/cuda/lib64 /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "!sed -i '17s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/local/cuda/targets/x86_64-linux/include /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "\n",
+ "# Modify line 21 in the CMake file to fix a compilation error\n",
+ "# https://github.com/pytorch/pytorch/issues/103371\n",
+ "!sed -i '21s,.*,set_property(TARGET mrc PROPERTY CXX_STANDARD 17),' 
CMakeLists.txt\n",
+ "# Build MRC tracking code\n",
+ "!mkdir build\n",
+ "%cd build\n",
+ "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch ..\n",
+ "!make"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FcFzpREn8_1P"
+ },
+ "outputs": [],
+ "source": [
+ "#Running the MRC\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n",
+ "!python3 run_once.py /content/egocentric/audio-visual/data/v2/clips ego4d 389"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Audio Embedding"
+ ],
+ "metadata": {
+ "id": "PY4Mha_QrjV-"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_OzROruu9CVO"
+ },
+ "outputs": [],
+ "source": [
+ "#Audio Embedding\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/audio_embedding/make_audio_embeddings\n",
+ "!python3 batch_audio_embedding.py /content/egocentric/audio-visual/data/v2/clips val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Device wearer voice activity detection"
+ ],
+ "metadata": {
+ "id": "XXUYRiQ1rueJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "####Energy Based Method"
+ ],
+ "metadata": {
+ "id": "kssnc4VNr4ra"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "LMZ3jbsM9E5c"
+ },
+ "outputs": [],
+ "source": [
+ "#Wearer: energy-based method\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/wearer/energy_based\n",
+ "!python3 short_time_energy.py /content/egocentric/audio-visual/data/v2/clips val\n",
+ "!python3 match_wearer_audio.py val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Surrounding people voice matching (MRC)"
+ ],
+ "metadata": {
+ "id": "pQnNzUoasJjx"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wOTCIPGi9Mpn"
+ },
+ "outputs": [],
+ "source": [
+ "#Surrounding People Audio Matching: MRC\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/surrounding_people_audio_matching/mrc\n",
+ "!python3 match_audio.py /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction/results val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7rVXK-W5b_Nu"
+ },
+ "source": [
+ "#Transcription\n",
+ "- Need to move \"av_test_unannotated.json\", \"av_train.json\", and \"av_val.json\" from /content/egocentric/audio-visual/utils/ground_truth to /content/egocentric/audio-visual/data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gWtnMbJrbybQ"
+ },
+ "outputs": [],
+ "source": [
+ "#Install Miniconda\n",
+ "%cd /content/\n",
+ "#https://www.kaggle.com/code/alaajah/creating-virtual-environment-on-google-colab\n",
+ "!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
+ "!chmod +x Miniconda3-latest-Linux-x86_64.sh\n",
+ "!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local\n",
+ "!conda install -q -y --prefix /usr/local python=3.8.10 ujson"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IvLBi0e_cEq8"
+ },
+ "outputs": [],
+ "source": [
+ "#Install Sclite\n",
+ "%cd /content/\n",
+ "!git clone https://github.com/usnistgov/SCTK.git\n",
+ "%cd SCTK\n",
+ "! make config\n",
+ "! make all\n",
+ "! make check\n",
+ "! make install\n",
+ "! 
make doc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "To36j1F_cLIC"
+ },
+ "outputs": [],
+ "source": [
+ "#Create the transcription environment\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!conda create --name transcription_env --file requirements_38_10.txt\n",
+ "!pip install soundfile\n",
+ "!pip install torch\n",
+ "!pip install espnet_model_zoo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "d7do0eXVcQRr"
+ },
+ "outputs": [],
+ "source": [
+ "#Extract 16kHz single channel audio files in wav format from videos\n",
+ "%cd /content/egocentric/audio-visual/data/v2\n",
+ "!mkdir wavs_16000\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!chmod +x extract_wav.sh\n",
+ "!./extract_wav.sh /content/egocentric/audio-visual/data/v2/clips /content/egocentric/audio-visual/data/v2/wavs_16000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "okmVTCt4cadk"
+ },
+ "outputs": [],
+ "source": [
+ "#Extract transcriptions from the annotation files, decode audio and score the decoding output\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!pip install torchaudio\n",
+ "!chmod +x score_asr.sh\n",
+ "!./score_asr.sh /content/egocentric/audio-visual/transcription/output 1"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "provenance": [],
+ "gpuType": "V100"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file

From 23da5894d50b9d350579ac19db06e4769b93690f Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:51:11 -0700
Subject: [PATCH 2/9] Create README.md

---
 extensions/labgraph_diarization/README.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 extensions/labgraph_diarization/README.md

diff --git a/extensions/labgraph_diarization/README.md b/extensions/labgraph_diarization/README.md
new file mode 100644
index 00000000..e437e011
--- /dev/null
+++ b/extensions/labgraph_diarization/README.md
@@ -0,0 +1,5 @@
+#EGO4D Audio-Visual Diarization Benchmark Colab
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm]
+
+The Colab downloads a subset of video clips from the EGO4D Dataset and performs Audio-Visual Diarization and Transcription with the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md). In order to access the dataset you will need to review the license agreement and accept the terms for [EGO4D](https://ego4d-data.org/docs/start-here/). 
+
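Since the whole pipeline above hinges on every UID in `video_uids.txt` actually arriving in `data/v2/clips` during the download step, a quick sanity check can catch a partial download before the later stages fail. The snippet below is a hypothetical convenience helper, not part of the repo or these patches, and it assumes the clips are stored as `<uid>.mp4`:

```python
# Hypothetical sanity check: confirm each requested EGO4D clip was downloaded.
# Assumes clips land in data/v2/clips as <uid>.mp4; adjust if the layout differs.
from pathlib import Path

clips_dir = Path("/content/egocentric/audio-visual/data/v2/clips")
uids_file = Path("/content/egocentric/audio-visual/video_uids.txt")

requested = {line.strip() for line in uids_file.read_text().splitlines() if line.strip()}
downloaded = {p.stem for p in clips_dir.glob("*.mp4")}

missing = sorted(requested - downloaded)
if missing:
    print(f"{len(missing)} of {len(requested)} clips missing:", *missing, sep="\n  ")
else:
    print(f"All {len(requested)} requested clips are present.")
```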
From 76727c73b43af370075cd3ace9525871d6e40533 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:57:43 -0700
Subject: [PATCH 3/9] Updated README.md

---
 extensions/labgraph_diarization/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/extensions/labgraph_diarization/README.md b/extensions/labgraph_diarization/README.md
index e437e011..58c7d7f8 100644
--- a/extensions/labgraph_diarization/README.md
+++ b/extensions/labgraph_diarization/README.md
@@ -1,5 +1,6 @@
-#EGO4D Audio-Visual Diarization Benchmark Colab
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm]
+# EGO4D Audio-Visual Diarization Benchmark Colab
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm)
 
 The Colab downloads a subset of video clips from the EGO4D Dataset and performs Audio-Visual Diarization and Transcription with the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md). In order to access the dataset you will need to review the license agreement and accept the terms for [EGO4D](https://ego4d-data.org/docs/start-here/).
 

From 099af2f3782ffc5458188d1f85f4eb2e3fff7ca9 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 17:01:26 -0700
Subject: [PATCH 4/9] Changed description of Ego4d_AudioVisual.ipynb

---
 .../Ego4D_AudioVisual.ipynb | 142 +++++++++---------
 1 file changed, 67 insertions(+), 75 deletions(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 4f40a145..31ee14da 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -7,20 +7,20 @@
 },
 "source": [
 "**EGO4D Audio-Visual Diarization Benchmark**\n",
- "- This notebook allows a quickstart into the EGO4D Audio Visual Repo\n",
+ "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n",
 "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
 "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
- "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab, the changes are commented out and are reversible"
+ "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
 ]
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Install Dependencies"
- ],
 "metadata": {
 "id": "hIBjA7mUpK4F"
- }
+ },
+ "source": [
+ "##Install Dependencies"
+ ]
 },
 {
 "cell_type": "code",
@@ -56,12 +56,12 @@
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Clone Repository & download the videos"
- ],
 "metadata": {
 "id": "fbt09eafpoET"
- }
+ },
+ "source": [
+ "##Clone Repository & download the videos"
+ ]
 },
 {
 "cell_type": "code",
@@ -106,21 +106,21 @@
 },
 {
 "cell_type": "markdown",
- "source": [
- "#EGO4D Audio-Visual Diarization Baseline"
- ],
 "metadata": {
 "id": "Mlbjm9kPvqLr"
- }
+ },
+ "source": [
+ "#EGO4D Audio-Visual Diarization Baseline"
+ ]
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Preprocess ground truth data"
- ],
 "metadata": {
 "id": "d4opZblNpyHC"
- }
+ },
+ "source": [
+ "##Preprocess ground truth data"
+ ]
 },
 {
"cell_type": "code", @@ -143,21 +143,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "jt7yzV648gOr", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "jt7yzV648gOr", "outputId": "acf51bc2-fb54-4769-c20a-5e7d1b53d25f" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/content/egocentric/audio-visual/utils/ground_truth\n" - ] - } - ], + "outputs": [], "source": [ "#Run visualize_ground_truth.py (It downloads the output video to the current directory)\n", "%cd /content/egocentric/audio-visual/utils/ground_truth/\n", @@ -166,21 +158,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Localization & Tracking" - ], "metadata": { "id": "c2fNByhuqTHn" - } + }, + "source": [ + "##Localization & Tracking" + ] }, { "cell_type": "markdown", - "source": [ - "###People Detection Setup" - ], "metadata": { "id": "ZPQT5dMUqZjo" - } + }, + "source": [ + "###People Detection Setup" + ] }, { "cell_type": "code", @@ -213,12 +205,12 @@ }, { "cell_type": "markdown", - "source": [ - "###Short Term Tracking" - ], "metadata": { "id": "vqidCROwqfJA" - } + }, + "source": [ + "###Short Term Tracking" + ] }, { "cell_type": "code", @@ -247,12 +239,12 @@ }, { "cell_type": "markdown", - "source": [ - "###Run Global People Tracking" - ], "metadata": { "id": "jIfxzYDgqlkf" - } + }, + "source": [ + "###Run Global People Tracking" + ] }, { "cell_type": "code", @@ -269,12 +261,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Voice Activity Detection (VAD)" - ], "metadata": { "id": "uG8NV_oUrDZY" - } + }, + "source": [ + "##Voice Activity Detection (VAD)" + ] }, { "cell_type": "code", @@ -292,21 +284,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Active Speaker Detection (ASD)" - ], "metadata": { "id": "rVr2OzKIrRBY" - } + }, + "source": [ + "##Active Speaker Detection (ASD)" + ] }, { "cell_type": "markdown", - "source": [ - "####Mouth Region Classification (MRC)" - ], "metadata": { "id": "2BN1eTm_rePW" - } + }, + "source": [ + "####Mouth Region Classification (MRC)" + ] }, { "cell_type": "code", @@ -350,12 +342,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Audio Embedding" - ], "metadata": { "id": "PY4Mha_QrjV-" - } + }, + "source": [ + "##Audio Embedding" + ] }, { "cell_type": "code", @@ -372,21 +364,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Device wearer voice activity detection" - ], "metadata": { "id": "XXUYRiQ1rueJ" - } + }, + "source": [ + "##Device wearer voice activity detection" + ] }, { "cell_type": "markdown", - "source": [ - "####Energy Based Method" - ], "metadata": { "id": "kssnc4VNr4ra" - } + }, + "source": [ + "####Energy Based Method" + ] }, { "cell_type": "code", @@ -404,12 +396,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Surrounding people voice matching (MRC)" - ], "metadata": { "id": "pQnNzUoasJjx" - } + }, + "source": [ + "##Surrounding people voice matching (MRC)" + ] }, { "cell_type": "code", @@ -521,8 +513,8 @@ "metadata": { "accelerator": "GPU", "colab": { - "provenance": [], - "gpuType": "V100" + "gpuType": "V100", + "provenance": [] }, "kernelspec": { "display_name": "Python 3", @@ -534,4 +526,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From bf66b1c1cd33081666750d5f74ca1202c66755ba Mon Sep 17 00:00:00 2001 From: Ashneet Date: Tue, 19 Dec 2023 23:25:54 -0700 Subject: [PATCH 5/9] Update Ego4D_AudioVisual.ipynb - Updated description - Moved location of a cell --- .../Ego4D_AudioVisual.ipynb | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 
deletions(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 31ee14da..1e9ea238 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -9,7 +9,7 @@
 "**EGO4D Audio-Visual Diarization Benchmark**\n",
 "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n",
 "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
- "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
+ "- Hardware accelerator should be T4GPU\n",
 "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
 ]
 },
@@ -39,21 +39,6 @@
 "!pip install pydub audiosegment"
 ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "qssel_-R8d-d"
- },
- "outputs": [],
- "source": [
- "# Install libtorch\n",
- "%cd /content/egocentric\n",
- "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
- "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
- "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -104,6 +89,19 @@
 "!mv data/v2/annotations/* utils/ground_truth"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install libtorch\n",
+ "%cd /content/egocentric\n",
+ "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
+ "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
+ "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
+ ]
+},
 {
 "cell_type": "markdown",
 "metadata": {

From ef7feed3bbe2c67ba96e66f73fa62c5845920386 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 23:31:12 -0700
Subject: [PATCH 6/9] Update Ego4D_AudioVisual.ipynb

- Specified the architecture in the people detection setup cell

---
 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 1e9ea238..640ff7bf 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -188,6 +188,9 @@
 "!sed -i '44s/.*/LDFLAGS+= `pkg-config --libs opencv4` -lstdc++/' Makefile\n",
 "!sed -i '45s/.*/COMMON+= `pkg-config --cflags opencv4`/' Makefile\n",
 "\n",
+ "#Specifying the arch\n",
+ "!sed -i '14s/.*/ARCH= -gencode arch=compute_75,code=sm_75/' Makefile\n",
+ "\n",
 "# Add missing headers required to build using opencv4\n",
 "# https://stackoverflow.com/questions/64885148/error-iplimage-does-not-name-a-type-when-trying-to-build-darknet-with-opencv\n",
 "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.cpp\n",

From a8026fdc8913f06c7c21edffb6b727947b71f09b Mon Sep 17 00:00:00 2001
From: Ashneet <96892013+ashneet1@users.noreply.github.com>
Date: Tue, 19 Dec 2023 23:37:35 -0700
Subject: [PATCH 7/9] Update Ego4D_AudioVisual.ipynb

---
 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 
640ff7bf..d0191245 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -9,7 +9,7 @@ "**EGO4D Audio-Visual Diarization Benchmark**\n", "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", - "- Hardware accelerator should be T4GPU\n", + "- Hardware accelerator should be T4G PU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab" ] }, From d1172da071247684bc62a7463d5b69fdcde50c93 Mon Sep 17 00:00:00 2001 From: Ashneet <96892013+ashneet1@users.noreply.github.com> Date: Tue, 19 Dec 2023 23:37:59 -0700 Subject: [PATCH 8/9] Update Ego4D_AudioVisual.ipynb --- extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb index d0191245..9359d911 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -9,7 +9,7 @@ "**EGO4D Audio-Visual Diarization Benchmark**\n", "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", - "- Hardware accelerator should be T4G PU\n", + "- Hardware accelerator should be T4 GPU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab" ] }, From c4662e905a9584c5f24866b14cf69aecfecdd1cb Mon Sep 17 00:00:00 2001 From: Ashneet Date: Wed, 20 Dec 2023 16:01:16 -0700 Subject: [PATCH 9/9] Changed description of colab --- extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb index 640ff7bf..1e55487a 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -7,7 +7,7 @@ }, "source": [ "**EGO4D Audio-Visual Diarization Benchmark**\n", - "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", + "- This notebook allows a quickstart into the [EGO4D Audio Visual Diarization](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md) and [Transcription](https://github.com/EGO4D/audio-visual/blob/main/transcription/README.md) from the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual)\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", "- Hardware accelerator should be T4GPU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
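A closing note on the energy-based device-wearer step the notebook runs (`short_time_energy.py` followed by `match_wearer_audio.py`): the intuition is that the wearer's own speech tends to dominate the egocentric microphone, so frames whose short-time energy clears a threshold are good wearer-speech candidates. Below is a minimal, self-contained sketch of that idea; the frame length, hop, and adaptive threshold are illustrative assumptions, not the EGO4D baseline's actual parameters.

```python
# Illustrative sketch of energy-based voice activity detection.
# Frame size, hop, and threshold are demonstration values, not the
# parameters used by the EGO4D baseline's short_time_energy.py.
import numpy as np
import soundfile as sf

def short_time_energy_vad(wav_path, frame_ms=25, hop_ms=10, threshold_ratio=0.5):
    """Return (start_sec, end_sec) spans whose short-time energy exceeds a threshold."""
    audio, sr = sf.read(wav_path)
    if audio.ndim > 1:                      # mix multi-channel audio down to mono
        audio = audio.mean(axis=1)
    frame = int(sr * frame_ms / 1000)
    hop = int(sr * hop_ms / 1000)
    n_frames = max(0, 1 + (len(audio) - frame) // hop)
    if n_frames == 0:
        return []
    energy = np.array([float(np.sum(audio[i * hop:i * hop + frame] ** 2))
                       for i in range(n_frames)])
    active = energy > threshold_ratio * energy.mean()   # simple adaptive threshold

    spans, start = [], None
    for i, on in enumerate(active):
        if on and start is None:            # a voiced run begins
            start = i
        elif not on and start is not None:  # a voiced run ends
            spans.append((start * hop / sr, ((i - 1) * hop + frame) / sr))
            start = None
    if start is not None:                   # run extends to the end of the file
        spans.append((start * hop / sr, len(audio) / sr))
    return spans

# Example on one of the 16 kHz wavs the transcription step extracts:
# spans = short_time_energy_vad("/content/egocentric/audio-visual/data/v2/wavs_16000/<clip uid>.wav")
```

The sketch stops at span extraction; in the notebook, the follow-up `match_wearer_audio.py` step consumes the energy output downstream.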