From 8749b41af13f3d719016b80ef2a44b9215c3621d Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:28:22 -0700
Subject: [PATCH 1/9] Added the Colab

---
 .../Ego4D_AudioVisual.ipynb | 537 ++++++++++++++++++
 1 file changed, 537 insertions(+)
 create mode 100644 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
new file mode 100644
index 00000000..4f40a145
--- /dev/null
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -0,0 +1,537 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Ad3Kas-mn3Ke"
+ },
+ "source": [
+ "**EGO4D Audio-Visual Diarization Benchmark**\n",
+ "- This notebook allows a quickstart into the EGO4D Audio Visual Repo\n",
+ "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
+ "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
+ "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab, the changes are commented out and are reversible"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Install Dependencies"
+ ],
+ "metadata": {
+ "id": "hIBjA7mUpK4F"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lhUO_Ov678bm"
+ },
+ "outputs": [],
+ "source": [
+ "!apt install ffmpeg python3-pip git\n",
+ "!pip install ego4d awscli numpy opencv-python pyqt5 opencv-contrib-python torch torchvision torchaudio\n",
+ "!sudo apt-get install libavcodec-dev libavformat-dev libswscale-dev libv4l-dev\n",
+ "!sudo apt-get install libxvidcore-dev libx264-dev\n",
+ "!sudo apt install libgtk2.0-dev liblcm-dev\n",
+ "!sudo apt-get install liblcm-dev\n",
+ "!pip install pydub audiosegment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qssel_-R8d-d"
+ },
+ "outputs": [],
+ "source": [
+ "# Install libtorch\n",
+ "%cd /content/egocentric\n",
+ "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
+ "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
+ "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Clone Repository & download the videos"
+ ],
+ "metadata": {
+ "id": "fbt09eafpoET"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "o77Hf1uJ8Fei"
+ },
+ "outputs": [],
+ "source": [
+ "# Create the new egocentric directory\n",
+ "!mkdir egocentric\n",
+ "%cd /content/egocentric\n",
+ "\n",
+ "# Clone the audio-visual repo\n",
+ "!git clone https://github.com/ashneet1/audio-visual.git\n",
+ "%cd /content/egocentric/audio-visual\n",
+ "!mkdir data\n",
+ "\n",
+ "# List the video uids to download\n",
+ "!touch video_uids.txt\n",
+ "!echo \"0b4cacb1-970f-4ef0-85da-371d81f899e0\" >> video_uids.txt\n",
+ "!echo \"c2413391-7c1b-4fd6-8b1d-98ee7888b9f8\" >> video_uids.txt\n",
+ "!echo \"fe69a78e-7773-45d1-9e0f-bacee52dac83\" >> video_uids.txt\n",
+ "!echo \"3b79017c-4d42-40fc-a1bb-4a20bc8ebca7\" >> video_uids.txt\n",
+ "!echo \"6dbfc053-7899-40d8-9827-0ccd21f3ee0a\" >> video_uids.txt\n",
+ "!echo \"7e6dfd31-8544-4fad-9e49-0f05516cf8cf\" >> video_uids.txt\n",
+ "!echo \"56c5af79-f9d4-478d-96ef-6d71e0bbbdfe\" >> video_uids.txt\n",
+ "!echo \"d97bedc8-72df-43be-a55b-4da1ae42dfd1\" >> video_uids.txt\n",
+ "!echo \"f0cb79ef-c081-4049-85ef-2623e02c9589\" >> 
video_uids.txt\n", + "!echo \"08b0935e-6260-4bd6-86ca-f6fc54e388be\" >> video_uids.txt\n", + "!echo \"6b34c327-000c-42b6-b242-d3dca63a7508\" >> video_uids.txt\n", + "!echo \"076bdb81-5c75-4282-9f3a-a387624575f3\" >> video_uids.txt\n", + "\n", + "# Configure aws cli to be able to access the ego4d dataset\n", + "!aws configure\n", + "\n", + "# Download ego4d model, annotation, and videos\n", + "!ego4d -y --output_directory ./data --datasets av_models clips annotations --benchmarks av --video_uid_file video_uids.txt\n", + "!tar xf data/v2/av_models/pretrained_av_models.tar.gz\n", + "!mv data/v2/annotations/* utils/ground_truth" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#EGO4D Audio-Visual Diarization Baseline" + ], + "metadata": { + "id": "Mlbjm9kPvqLr" + } + }, + { + "cell_type": "markdown", + "source": [ + "##Preprocess ground truth data" + ], + "metadata": { + "id": "d4opZblNpyHC" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IM6020Xg8aSF" + }, + "outputs": [], + "source": [ + "# Preprocess ground truth data\n", + "%cd /content/egocentric/audio-visual/utils/ground_truth\n", + "!bash init_dirs.sh\n", + "!python3 extract_clipnames_and_split_indices.py\n", + "!python3 extract_boxes_and_speakers.py\n", + "!python3 make_mot_ground_truth.py ../../data/v2/clips val\n", + "!mv tracking_evaluation/mot_challenge ../../tracking/tracking_evaluation/data/gt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jt7yzV648gOr", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "acf51bc2-fb54-4769-c20a-5e7d1b53d25f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/egocentric/audio-visual/utils/ground_truth\n" + ] + } + ], + "source": [ + "#Run visualize_ground_truth.py (It downloads the output video to the current directory)\n", + "%cd /content/egocentric/audio-visual/utils/ground_truth/\n", + "!python3 visualize_ground_truth.py /content/egocentric/audio-visual/data/v2/clips 0b4cacb1-970f-4ef0-85da-371d81f899e0 #This is 389" + ] + }, + { + "cell_type": "markdown", + "source": [ + "##Localization & Tracking" + ], + "metadata": { + "id": "c2fNByhuqTHn" + } + }, + { + "cell_type": "markdown", + "source": [ + "###People Detection Setup" + ], + "metadata": { + "id": "ZPQT5dMUqZjo" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZqOp5PWA8nNR" + }, + "outputs": [], + "source": [ + "# People detection setup\n", + "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#people-detection\n", + "%cd /content/egocentric/audio-visual/tracking/people_detection\n", + "\n", + "# Replace the lines in the makefile to use opencv4 instead of opencv3\n", + "!sed -i '44s/.*/LDFLAGS+= `pkg-config --libs opencv4` -lstdc++/' Makefile\n", + "!sed -i '45s/.*/COMMON+= `pkg-config --cflags opencv4`/' Makefile\n", + "\n", + "# Add missing headers required to build using opencv4\n", + "# https://stackoverflow.com/questions/64885148/error-iplimage-does-not-name-a-type-when-trying-to-build-darknet-with-opencv\n", + "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.cpp\n", + "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' src/image_opencv.cpp\n", + "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.cpp\n", + "\n", + "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.hpp\n", + "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' 
src/image_opencv.hpp\n",
+ "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.hpp\n",
+ "\n",
+ "!make -j"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "###Short Term Tracking"
+ ],
+ "metadata": {
+ "id": "vqidCROwqfJA"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2gpLUYd68vE2"
+ },
+ "outputs": [],
+ "source": [
+ "# Short term tracking setup\n",
+ "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#short_term_tracking\n",
+ "%cd /content/egocentric/audio-visual/tracking/short_term_tracking\n",
+ "\n",
+ "# Modify line 13 in the CMake file to include the opencv4 directory\n",
+ "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n",
+ "!sed -i '13s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "\n",
+ "# Modify line 17 in the CMake file to fix a compilation error\n",
+ "# https://github.com/pytorch/pytorch/issues/103371\n",
+ "!sed -i '17s,.*,set_property(TARGET short_term_tracker PROPERTY CXX_STANDARD 17),' CMakeLists.txt\n",
+ "!mkdir build\n",
+ "%cd build\n",
+ "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch ..\n",
+ "!make"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "###Run Global People Tracking"
+ ],
+ "metadata": {
+ "id": "jIfxzYDgqlkf"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "D-a-ez1A85EB"
+ },
+ "outputs": [],
+ "source": [
+ "#Global People Tracking\n",
+ "%cd /content/egocentric/audio-visual/tracking\n",
+ "!python3 single_run.py /content/egocentric/audio-visual/data/v2/clips 438"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Voice Activity Detection (VAD)"
+ ],
+ "metadata": {
+ "id": "uG8NV_oUrDZY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "SxlxvS0Y81BU"
+ },
+ "outputs": [],
+ "source": [
+ "#Voice Activity Detection\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/vad\n",
+ "!python3 extract_all_audio.py /content/egocentric/audio-visual/data/v2/clips\n",
+ "!python3 vad.py"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Active Speaker Detection (ASD)"
+ ],
+ "metadata": {
+ "id": "rVr2OzKIrRBY"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "####Mouth Region Classification (MRC)"
+ ],
+ "metadata": {
+ "id": "2BN1eTm_rePW"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ipOe-4y1883Q"
+ },
+ "outputs": [],
+ "source": [
+ "# Active Speaker Detection (ASD)\n",
+ "# Mouth region classification (MRC)\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n",
+ "\n",
+ "# Modify lines 15 and 17 in the CMake file to include the opencv4 directory\n",
+ "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n",
+ "!sed -i '15s,.*,link_directories( /usr/local/lib /usr/local/cuda/lib64 /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "!sed -i '17s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/local/cuda/targets/x86_64-linux/include /usr/include/opencv4/ ),' CMakeLists.txt\n",
+ "\n",
+ "# Modify line 21 in the CMake file to fix a compilation error\n",
+ "# https://github.com/pytorch/pytorch/issues/103371\n",
+ "!sed -i '21s,.*,set_property(TARGET mrc PROPERTY CXX_STANDARD 17),' 
CMakeLists.txt\n",
+ "# Build MRC tracking code\n",
+ "!mkdir build\n",
+ "%cd build\n",
+ "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch ..\n",
+ "!make"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FcFzpREn8_1P"
+ },
+ "outputs": [],
+ "source": [
+ "#Running the MRC\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n",
+ "!python3 run_once.py /content/egocentric/audio-visual/data/v2/clips ego4d 389"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Audio Embedding"
+ ],
+ "metadata": {
+ "id": "PY4Mha_QrjV-"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_OzROruu9CVO"
+ },
+ "outputs": [],
+ "source": [
+ "#Audio Embedding\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/audio_embedding/make_audio_embeddings\n",
+ "!python3 batch_audio_embedding.py /content/egocentric/audio-visual/data/v2/clips val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Device wearer voice activity detection"
+ ],
+ "metadata": {
+ "id": "XXUYRiQ1rueJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "####Energy Based Method"
+ ],
+ "metadata": {
+ "id": "kssnc4VNr4ra"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "LMZ3jbsM9E5c"
+ },
+ "outputs": [],
+ "source": [
+ "#Wearer: energy-based method\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/wearer/energy_based\n",
+ "!python3 short_time_energy.py /content/egocentric/audio-visual/data/v2/clips val\n",
+ "!python3 match_wearer_audio.py val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "##Surrounding people voice matching (MRC)"
+ ],
+ "metadata": {
+ "id": "pQnNzUoasJjx"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wOTCIPGi9Mpn"
+ },
+ "outputs": [],
+ "source": [
+ "#Surrounding People Audio Matching: MRC\n",
+ "%cd /content/egocentric/audio-visual/active-speaker-detection/surrounding_people_audio_matching/mrc\n",
+ "!python3 match_audio.py /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction/results val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7rVXK-W5b_Nu"
+ },
+ "source": [
+ "#Transcription\n",
+ "- Need to move \"av_test_unannotated.json\", \"av_train.json\", and \"av_val.json\" from /content/egocentric/audio-visual/utils/ground_truth to /content/egocentric/audio-visual/data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gWtnMbJrbybQ"
+ },
+ "outputs": [],
+ "source": [
+ "#Install Miniconda\n",
+ "%cd /content/\n",
+ "#https://www.kaggle.com/code/alaajah/creating-virtual-environment-on-google-colab\n",
+ "!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
+ "!chmod +x Miniconda3-latest-Linux-x86_64.sh\n",
+ "!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local\n",
+ "!conda install -q -y --prefix /usr/local python=3.8.10 ujson"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IvLBi0e_cEq8"
+ },
+ "outputs": [],
+ "source": [
+ "#Install Sclite\n",
+ "%cd /content/\n",
+ "!git clone https://github.com/usnistgov/SCTK.git\n",
+ "%cd SCTK\n",
+ "! make config\n",
+ "! make all\n",
+ "! make check\n",
+ "! make install\n",
+ "! 
make doc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "To36j1F_cLIC"
+ },
+ "outputs": [],
+ "source": [
+ "#Create the transcription environment\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!conda create --name transcription_env --file requirements_38_10.txt\n",
+ "!pip install soundfile\n",
+ "!pip install torch\n",
+ "!pip install espnet_model_zoo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "d7do0eXVcQRr"
+ },
+ "outputs": [],
+ "source": [
+ "#Extract 16kHz single channel audio files in wav format from videos\n",
+ "%cd /content/egocentric/audio-visual/data/v2\n",
+ "!mkdir wavs_16000\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!chmod +x extract_wav.sh\n",
+ "!./extract_wav.sh /content/egocentric/audio-visual/data/v2/clips /content/egocentric/audio-visual/data/v2/wavs_16000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "okmVTCt4cadk"
+ },
+ "outputs": [],
+ "source": [
+ "#Extract transcriptions from the annotation files, decode audio and score the decoding output\n",
+ "%cd /content/egocentric/audio-visual/transcription\n",
+ "!pip install torchaudio\n",
+ "!chmod +x score_asr.sh\n",
+ "!./score_asr.sh /content/egocentric/audio-visual/transcription/output 1"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "provenance": [],
+ "gpuType": "V100"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file

From 23da5894d50b9d350579ac19db06e4769b93690f Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:51:11 -0700
Subject: [PATCH 2/9] Create README.md

---
 extensions/labgraph_diarization/README.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 extensions/labgraph_diarization/README.md

diff --git a/extensions/labgraph_diarization/README.md b/extensions/labgraph_diarization/README.md
new file mode 100644
index 00000000..e437e011
--- /dev/null
+++ b/extensions/labgraph_diarization/README.md
@@ -0,0 +1,5 @@
+#EGO4D Audio-Visual Diarization Benchmark Colab
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm]
+
+The Colab downloads a subset of video clips from the EGO4D Dataset and performs Audio-Visual Diarization and Transcription with the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md). In order to access the dataset you will need to review the license agreement and accept the terms for [EGO4D](https://ego4d-data.org/docs/start-here/). 
+
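Since the whole pipeline above hinges on every UID in `video_uids.txt` actually arriving in `data/v2/clips` during the download step, a quick sanity check can catch a partial download before the later stages fail. The snippet below is a hypothetical convenience helper, not part of the repo or these patches, and it assumes the clips are stored as `<uid>.mp4`:

```python
# Hypothetical sanity check: confirm each requested EGO4D clip was downloaded.
# Assumes clips land in data/v2/clips as <uid>.mp4; adjust if the layout differs.
from pathlib import Path

clips_dir = Path("/content/egocentric/audio-visual/data/v2/clips")
uids_file = Path("/content/egocentric/audio-visual/video_uids.txt")

requested = {line.strip() for line in uids_file.read_text().splitlines() if line.strip()}
downloaded = {p.stem for p in clips_dir.glob("*.mp4")}

missing = sorted(requested - downloaded)
if missing:
    print(f"{len(missing)} of {len(requested)} clips missing:", *missing, sep="\n  ")
else:
    print(f"All {len(requested)} requested clips are present.")
```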
From 76727c73b43af370075cd3ace9525871d6e40533 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 16:57:43 -0700
Subject: [PATCH 3/9] Updated README.md

---
 extensions/labgraph_diarization/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/extensions/labgraph_diarization/README.md b/extensions/labgraph_diarization/README.md
index e437e011..58c7d7f8 100644
--- a/extensions/labgraph_diarization/README.md
+++ b/extensions/labgraph_diarization/README.md
@@ -1,5 +1,6 @@
-#EGO4D Audio-Visual Diarization Benchmark Colab
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm]
+# EGO4D Audio-Visual Diarization Benchmark Colab
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm)
 
 The Colab downloads a subset of video clips from the EGO4D Dataset and performs Audio-Visual Diarization and Transcription with the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md). In order to access the dataset you will need to review the license agreement and accept the terms for [EGO4D](https://ego4d-data.org/docs/start-here/).
 

From 099af2f3782ffc5458188d1f85f4eb2e3fff7ca9 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 17:01:26 -0700
Subject: [PATCH 4/9] Changed description of Ego4d_AudioVisual.ipynb

---
 .../Ego4D_AudioVisual.ipynb | 142 +++++++++---------
 1 file changed, 67 insertions(+), 75 deletions(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 4f40a145..31ee14da 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -7,20 +7,20 @@
 },
 "source": [
 "**EGO4D Audio-Visual Diarization Benchmark**\n",
- "- This notebook allows a quickstart into the EGO4D Audio Visual Repo\n",
+ "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n",
 "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
 "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
- "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab, the changes are commented out and are reversible"
+ "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
 ]
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Install Dependencies"
- ],
 "metadata": {
 "id": "hIBjA7mUpK4F"
- }
+ },
+ "source": [
+ "##Install Dependencies"
+ ]
 },
 {
 "cell_type": "code",
@@ -56,12 +56,12 @@
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Clone Repository & download the videos"
- ],
 "metadata": {
 "id": "fbt09eafpoET"
- }
+ },
+ "source": [
+ "##Clone Repository & download the videos"
+ ]
 },
 {
 "cell_type": "code",
@@ -106,21 +106,21 @@
 },
 {
 "cell_type": "markdown",
- "source": [
- "#EGO4D Audio-Visual Diarization Baseline"
- ],
 "metadata": {
 "id": "Mlbjm9kPvqLr"
- }
+ },
+ "source": [
+ "#EGO4D Audio-Visual Diarization Baseline"
+ ]
 },
 {
 "cell_type": "markdown",
- "source": [
- "##Preprocess ground truth data"
- ],
 "metadata": {
 "id": "d4opZblNpyHC"
- }
+ },
+ "source": [
+ "##Preprocess ground truth data"
+ ]
 },
 {
"cell_type": "code", @@ -143,21 +143,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "jt7yzV648gOr", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "jt7yzV648gOr", "outputId": "acf51bc2-fb54-4769-c20a-5e7d1b53d25f" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/content/egocentric/audio-visual/utils/ground_truth\n" - ] - } - ], + "outputs": [], "source": [ "#Run visualize_ground_truth.py (It downloads the output video to the current directory)\n", "%cd /content/egocentric/audio-visual/utils/ground_truth/\n", @@ -166,21 +158,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Localization & Tracking" - ], "metadata": { "id": "c2fNByhuqTHn" - } + }, + "source": [ + "##Localization & Tracking" + ] }, { "cell_type": "markdown", - "source": [ - "###People Detection Setup" - ], "metadata": { "id": "ZPQT5dMUqZjo" - } + }, + "source": [ + "###People Detection Setup" + ] }, { "cell_type": "code", @@ -213,12 +205,12 @@ }, { "cell_type": "markdown", - "source": [ - "###Short Term Tracking" - ], "metadata": { "id": "vqidCROwqfJA" - } + }, + "source": [ + "###Short Term Tracking" + ] }, { "cell_type": "code", @@ -247,12 +239,12 @@ }, { "cell_type": "markdown", - "source": [ - "###Run Global People Tracking" - ], "metadata": { "id": "jIfxzYDgqlkf" - } + }, + "source": [ + "###Run Global People Tracking" + ] }, { "cell_type": "code", @@ -269,12 +261,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Voice Activity Detection (VAD)" - ], "metadata": { "id": "uG8NV_oUrDZY" - } + }, + "source": [ + "##Voice Activity Detection (VAD)" + ] }, { "cell_type": "code", @@ -292,21 +284,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Active Speaker Detection (ASD)" - ], "metadata": { "id": "rVr2OzKIrRBY" - } + }, + "source": [ + "##Active Speaker Detection (ASD)" + ] }, { "cell_type": "markdown", - "source": [ - "####Mouth Region Classification (MRC)" - ], "metadata": { "id": "2BN1eTm_rePW" - } + }, + "source": [ + "####Mouth Region Classification (MRC)" + ] }, { "cell_type": "code", @@ -350,12 +342,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Audio Embedding" - ], "metadata": { "id": "PY4Mha_QrjV-" - } + }, + "source": [ + "##Audio Embedding" + ] }, { "cell_type": "code", @@ -372,21 +364,21 @@ }, { "cell_type": "markdown", - "source": [ - "##Device wearer voice activity detection" - ], "metadata": { "id": "XXUYRiQ1rueJ" - } + }, + "source": [ + "##Device wearer voice activity detection" + ] }, { "cell_type": "markdown", - "source": [ - "####Energy Based Method" - ], "metadata": { "id": "kssnc4VNr4ra" - } + }, + "source": [ + "####Energy Based Method" + ] }, { "cell_type": "code", @@ -404,12 +396,12 @@ }, { "cell_type": "markdown", - "source": [ - "##Surrounding people voice matching (MRC)" - ], "metadata": { "id": "pQnNzUoasJjx" - } + }, + "source": [ + "##Surrounding people voice matching (MRC)" + ] }, { "cell_type": "code", @@ -521,8 +513,8 @@ "metadata": { "accelerator": "GPU", "colab": { - "provenance": [], - "gpuType": "V100" + "gpuType": "V100", + "provenance": [] }, "kernelspec": { "display_name": "Python 3", @@ -534,4 +526,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From bf66b1c1cd33081666750d5f74ca1202c66755ba Mon Sep 17 00:00:00 2001 From: Ashneet Date: Tue, 19 Dec 2023 23:25:54 -0700 Subject: [PATCH 5/9] Update Ego4D_AudioVisual.ipynb - Updated description - Moved location of a cell --- .../Ego4D_AudioVisual.ipynb | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 
deletions(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 31ee14da..1e9ea238 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -9,7 +9,7 @@
 "**EGO4D Audio-Visual Diarization Benchmark**\n",
 "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n",
 "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n",
- "- Hardware accelerator could be T4GPU, V100 or A100 GPU\n",
+ "- Hardware accelerator should be T4GPU\n",
 "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
 ]
 },
@@ -39,21 +39,6 @@
 "!pip install pydub audiosegment"
 ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "qssel_-R8d-d"
- },
- "outputs": [],
- "source": [
- "# Install libtorch\n",
- "%cd /content/egocentric\n",
- "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
- "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
- "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
- ]
- },
 {
 "cell_type": "markdown",
 "metadata": {
@@ -104,6 +89,19 @@
 "!mv data/v2/annotations/* utils/ground_truth"
 ]
 },
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install libtorch\n",
+ "%cd /content/egocentric\n",
+ "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
+ "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
+ "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
+ ]
+},
 {
 "cell_type": "markdown",
 "metadata": {

From ef7feed3bbe2c67ba96e66f73fa62c5845920386 Mon Sep 17 00:00:00 2001
From: Ashneet
Date: Tue, 19 Dec 2023 23:31:12 -0700
Subject: [PATCH 6/9] Update Ego4D_AudioVisual.ipynb

- Specified the architecture in the people detection setup cell

---
 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 1e9ea238..640ff7bf 100644
--- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -188,6 +188,9 @@
 "!sed -i '44s/.*/LDFLAGS+= `pkg-config --libs opencv4` -lstdc++/' Makefile\n",
 "!sed -i '45s/.*/COMMON+= `pkg-config --cflags opencv4`/' Makefile\n",
 "\n",
+ "#Specifying the arch\n",
+ "!sed -i '14s/.*/ARCH= -gencode arch=compute_75,code=sm_75/' Makefile\n",
+ "\n",
 "# Add missing headers required to build using opencv4\n",
 "# https://stackoverflow.com/questions/64885148/error-iplimage-does-not-name-a-type-when-trying-to-build-darknet-with-opencv\n",
 "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.cpp\n",

From a8026fdc8913f06c7c21edffb6b727947b71f09b Mon Sep 17 00:00:00 2001
From: Ashneet <96892013+ashneet1@users.noreply.github.com>
Date: Tue, 19 Dec 2023 23:37:35 -0700
Subject: [PATCH 7/9] Update Ego4D_AudioVisual.ipynb

---
 extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
index 
640ff7bf..d0191245 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -9,7 +9,7 @@ "**EGO4D Audio-Visual Diarization Benchmark**\n", "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", - "- Hardware accelerator should be T4GPU\n", + "- Hardware accelerator should be T4G PU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab" ] }, From d1172da071247684bc62a7463d5b69fdcde50c93 Mon Sep 17 00:00:00 2001 From: Ashneet <96892013+ashneet1@users.noreply.github.com> Date: Tue, 19 Dec 2023 23:37:59 -0700 Subject: [PATCH 8/9] Update Ego4D_AudioVisual.ipynb --- extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb index d0191245..9359d911 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -9,7 +9,7 @@ "**EGO4D Audio-Visual Diarization Benchmark**\n", "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", - "- Hardware accelerator should be T4G PU\n", + "- Hardware accelerator should be T4 GPU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab" ] }, From c4662e905a9584c5f24866b14cf69aecfecdd1cb Mon Sep 17 00:00:00 2001 From: Ashneet Date: Wed, 20 Dec 2023 16:01:16 -0700 Subject: [PATCH 9/9] Changed description of colab --- extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb index 640ff7bf..1e55487a 100644 --- a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb +++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb @@ -7,7 +7,7 @@ }, "source": [ "**EGO4D Audio-Visual Diarization Benchmark**\n", - "- This notebook allows a quickstart into the EGO4D Audio Visual Repo, https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md\n", + "- This notebook allows a quickstart into the [EGO4D Audio Visual Diarization](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md) and [Transcription](https://github.com/EGO4D/audio-visual/blob/main/transcription/README.md) from the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual)\n", "- It runs a subset of video clips from the EGO4D dataset in EGO4D's Audio-Visual repo\n", "- Hardware accelerator should be T4GPU\n", "- Some changes to the code have been made in the forked repo so that it could be compatible with Google Colab"
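A closing note on the energy-based device-wearer step the notebook runs (`short_time_energy.py` followed by `match_wearer_audio.py`): the intuition is that the wearer's own speech tends to dominate the egocentric microphone, so frames whose short-time energy clears a threshold are good wearer-speech candidates. Below is a minimal, self-contained sketch of that idea; the frame length, hop, and adaptive threshold are illustrative assumptions, not the EGO4D baseline's actual parameters.

```python
# Illustrative sketch of energy-based voice activity detection.
# Frame size, hop, and threshold are demonstration values, not the
# parameters used by the EGO4D baseline's short_time_energy.py.
import numpy as np
import soundfile as sf

def short_time_energy_vad(wav_path, frame_ms=25, hop_ms=10, threshold_ratio=0.5):
    """Return (start_sec, end_sec) spans whose short-time energy exceeds a threshold."""
    audio, sr = sf.read(wav_path)
    if audio.ndim > 1:                      # mix multi-channel audio down to mono
        audio = audio.mean(axis=1)
    frame = int(sr * frame_ms / 1000)
    hop = int(sr * hop_ms / 1000)
    n_frames = max(0, 1 + (len(audio) - frame) // hop)
    if n_frames == 0:
        return []
    energy = np.array([float(np.sum(audio[i * hop:i * hop + frame] ** 2))
                       for i in range(n_frames)])
    active = energy > threshold_ratio * energy.mean()   # simple adaptive threshold

    spans, start = [], None
    for i, on in enumerate(active):
        if on and start is None:            # a voiced run begins
            start = i
        elif not on and start is not None:  # a voiced run ends
            spans.append((start * hop / sr, ((i - 1) * hop + frame) / sr))
            start = None
    if start is not None:                   # run extends to the end of the file
        spans.append((start * hop / sr, len(audio) / sr))
    return spans

# Example on one of the 16 kHz wavs the transcription step extracts:
# spans = short_time_energy_vad("/content/egocentric/audio-visual/data/v2/wavs_16000/<clip uid>.wav")
```

The sketch stops at span extraction; in the notebook, the follow-up `match_wearer_audio.py` step consumes the energy output downstream.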