diff --git a/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
new file mode 100644
index 00000000..a5e1c82d
--- /dev/null
+++ b/extensions/labgraph_diarization/Ego4D_AudioVisual.ipynb
@@ -0,0 +1,530 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Ad3Kas-mn3Ke"
+      },
+      "source": [
+        "**EGO4D Audio-Visual Diarization Benchmark**\n",
+        "- This notebook provides a quickstart for the [EGO4D Audio Visual Diarization](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md) and [Transcription](https://github.com/EGO4D/audio-visual/blob/main/transcription/README.md) tasks from the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual)\n",
+        "- It runs the benchmark on a subset of video clips from the EGO4D dataset, using the code in EGO4D's Audio-Visual repo\n",
+        "- The hardware accelerator should be a T4 GPU\n",
+        "- Some changes have been made in the forked repo so that the code is compatible with Google Colab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hIBjA7mUpK4F"
+      },
+      "source": [
+        "##Install Dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "lhUO_Ov678bm"
+      },
+      "outputs": [],
+      "source": [
+        "!apt install ffmpeg python3-pip git\n",
+        "# torch, not libtorch: the C++ libtorch distribution is downloaded separately below\n",
+        "!pip install ego4d awscli numpy opencv-python pyqt5 opencv-contrib-python torch torchvision torchaudio\n",
+        "!sudo apt-get install libavcodec-dev libavformat-dev libswscale-dev libv4l-dev\n",
+        "!sudo apt-get install libxvidcore-dev libx264-dev\n",
+        "!sudo apt install libgtk2.0-dev liblcm-dev\n",
+        "!pip install pydub audiosegment"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fbt09eafpoET"
+      },
+      "source": [
+        "##Clone Repository & Download the Videos"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "o77Hf1uJ8Fei"
+      },
+      "outputs": [],
+      "source": [
+        "# Create the new egocentric directory\n",
+        "!mkdir egocentric\n",
+        "%cd /content/egocentric\n",
+        "\n",
+        "# Clone the audio-visual repo\n",
+        "!git clone https://github.com/ashneet1/audio-visual.git\n",
+        "%cd /content/egocentric/audio-visual\n",
+        "!mkdir data\n",
+        "\n",
+        "# List the video uids to download\n",
+        "!touch video_uids.txt\n",
+        "!echo \"0b4cacb1-970f-4ef0-85da-371d81f899e0\" >> video_uids.txt\n",
+        "!echo \"c2413391-7c1b-4fd6-8b1d-98ee7888b9f8\" >> video_uids.txt\n",
+        "!echo \"fe69a78e-7773-45d1-9e0f-bacee52dac83\" >> video_uids.txt\n",
+        "!echo \"3b79017c-4d42-40fc-a1bb-4a20bc8ebca7\" >> video_uids.txt\n",
+        "!echo \"6dbfc053-7899-40d8-9827-0ccd21f3ee0a\" >> video_uids.txt\n",
+        "!echo \"7e6dfd31-8544-4fad-9e49-0f05516cf8cf\" >> video_uids.txt\n",
+        "!echo \"56c5af79-f9d4-478d-96ef-6d71e0bbbdfe\" >> video_uids.txt\n",
+        "!echo \"d97bedc8-72df-43be-a55b-4da1ae42dfd1\" >> video_uids.txt\n",
+        "!echo \"f0cb79ef-c081-4049-85ef-2623e02c9589\" >> video_uids.txt\n",
+        "!echo \"08b0935e-6260-4bd6-86ca-f6fc54e388be\" >> video_uids.txt\n",
+        "!echo \"6b34c327-000c-42b6-b242-d3dca63a7508\" >> video_uids.txt\n",
+        "!echo \"076bdb81-5c75-4282-9f3a-a387624575f3\" >> video_uids.txt\n",
+        "\n",
+        "# Configure the AWS CLI with the credentials issued for EGO4D dataset access\n",
+        "!aws configure\n",
+        "\n",
+        "# Download the ego4d models, annotations, and videos\n",
+        "!ego4d -y --output_directory ./data --datasets av_models clips annotations --benchmarks av --video_uid_file video_uids.txt\n",
+        "!tar xf data/v2/av_models/pretrained_av_models.tar.gz\n",
+        "!mv data/v2/annotations/* utils/ground_truth"
+      ]
+    },
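+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Optional sanity check (an addition for this notebook, not part of the benchmark repo): verify that every clip listed in video_uids.txt was downloaded. It assumes the ego4d CLI's default layout, where clips land in data/v2/clips/<uid>.mp4."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check that each requested clip was downloaded (assumes <uid>.mp4 naming)\n",
+        "import pathlib\n",
+        "\n",
+        "root = pathlib.Path(\"/content/egocentric/audio-visual\")\n",
+        "uids = (root / \"video_uids.txt\").read_text().split()\n",
+        "missing = [u for u in uids if not (root / \"data/v2/clips\" / f\"{u}.mp4\").exists()]\n",
+        "print(f\"{len(uids) - len(missing)}/{len(uids)} clips present\")\n",
+        "for u in missing:\n",
+        "    print(\"missing:\", u)"
+      ]
+    },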
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install libtorch (the C++ distribution of PyTorch, used by the C++ builds below)\n",
+        "%cd /content/egocentric\n",
+        "!wget \"https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.0%2Bcu118.zip\"\n",
+        "!unzip \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\"\n",
+        "!rm -rf \"libtorch-cxx11-abi-shared-with-deps-2.1.0+cu118.zip\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Mlbjm9kPvqLr"
+      },
+      "source": [
+        "#EGO4D Audio-Visual Diarization Baseline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "d4opZblNpyHC"
+      },
+      "source": [
+        "##Preprocess Ground Truth Data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IM6020Xg8aSF"
+      },
+      "outputs": [],
+      "source": [
+        "# Preprocess ground truth data\n",
+        "%cd /content/egocentric/audio-visual/utils/ground_truth\n",
+        "!bash init_dirs.sh\n",
+        "!python3 extract_clipnames_and_split_indices.py\n",
+        "!python3 extract_boxes_and_speakers.py\n",
+        "!python3 make_mot_ground_truth.py ../../data/v2/clips val\n",
+        "!mv tracking_evaluation/mot_challenge ../../tracking/tracking_evaluation/data/gt"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "jt7yzV648gOr",
+        "outputId": "acf51bc2-fb54-4769-c20a-5e7d1b53d25f"
+      },
+      "outputs": [],
+      "source": [
+        "# Run visualize_ground_truth.py (it writes the output video to the current directory)\n",
+        "%cd /content/egocentric/audio-visual/utils/ground_truth/\n",
+        "!python3 visualize_ground_truth.py /content/egocentric/audio-visual/data/v2/clips 0b4cacb1-970f-4ef0-85da-371d81f899e0 # clip index 389"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "c2fNByhuqTHn"
+      },
+      "source": [
+        "##Localization & Tracking"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ZPQT5dMUqZjo"
+      },
+      "source": [
+        "###People Detection Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ZqOp5PWA8nNR"
+      },
+      "outputs": [],
+      "source": [
+        "# People detection setup\n",
+        "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#people-detection\n",
+        "%cd /content/egocentric/audio-visual/tracking/people_detection\n",
+        "\n",
+        "# Replace the lines in the Makefile to use opencv4 instead of opencv3\n",
+        "!sed -i '44s/.*/LDFLAGS+= `pkg-config --libs opencv4` -lstdc++/' Makefile\n",
+        "!sed -i '45s/.*/COMMON+= `pkg-config --cflags opencv4`/' Makefile\n",
+        "\n",
+        "# Specify the GPU architecture (sm_75 = T4; see the check cell below if Colab assigned a different GPU)\n",
+        "!sed -i '14s/.*/ARCH= -gencode arch=compute_75,code=sm_75/' Makefile\n",
+        "\n",
+        "# Add missing headers required to build using opencv4\n",
+        "# https://stackoverflow.com/questions/64885148/error-iplimage-does-not-name-a-type-when-trying-to-build-darknet-with-opencv\n",
+        "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.cpp\n",
+        "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' src/image_opencv.cpp\n",
+        "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.cpp\n",
+        "\n",
+        "!sed -i '3 i #include \"opencv2/core/core_c.h\"' src/image_opencv.hpp\n",
+        "!sed -i '3 i #include \"opencv2/videoio/legacy/constants_c.h\"' src/image_opencv.hpp\n",
+        "!sed -i '3 i #include \"opencv2/highgui/highgui_c.h\"' src/image_opencv.hpp\n",
+        "\n",
+        "!make -j"
+      ]
+    },
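+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "The ARCH line above is hardcoded for a T4 (compute capability 7.5). This added check (not part of the original instructions) reports the capability of the GPU Colab actually assigned, so the Makefile can be adjusted if it differs."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Report the runtime GPU and its compute capability, e.g. (7, 5) for a T4\n",
+        "import torch\n",
+        "\n",
+        "major, minor = torch.cuda.get_device_capability(0)\n",
+        "print(f\"{torch.cuda.get_device_name(0)}: use ARCH sm_{major}{minor}\")"
+      ]
+    },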
"cell_type": "markdown", + "metadata": { + "id": "vqidCROwqfJA" + }, + "source": [ + "###Short Term Tracking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2gpLUYd68vE2" + }, + "outputs": [], + "source": [ + "# Short term tracking setup\n", + "# https://github.com/EGO4D/audio-visual/blob/main/tracking/README.md#short_term_tracking\n", + "%cd /content/egocentric/audio-visual/tracking/short_term_tracking\n", + "\n", + "# Modify line 13 in the CMake file to include the opencv4 directory\n", + "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n", + "!sed -i '13s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/include/opencv4/ ),' CMakeLists.txt\n", + "\n", + "# Modify line 17 in the CMake file to fix a compilation error\n", + "# https://github.com/pytorch/pytorch/issues/103371\n", + "!sed -i '17s,.*,set_property(TARGET short_term_tracker PROPERTY CXX_STANDARD 17),' CMakeLists.txt\n", + "!mkdir build\n", + "%cd build\n", + "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch ..\n", + "!make" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jIfxzYDgqlkf" + }, + "source": [ + "###Run Global People Tracking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D-a-ez1A85EB" + }, + "outputs": [], + "source": [ + "#Global People Tracking\n", + "%cd /content/egocentric/audio-visual/tracking\n", + "!python3 single_run.py /content/egocentric/audio-visual/data/v2/clips 438" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uG8NV_oUrDZY" + }, + "source": [ + "##Voice Activity Detection (VAD)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SxlxvS0Y81BU" + }, + "outputs": [], + "source": [ + "#Voice Activity Audio Detection\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/vad\n", + "!python3 extract_all_audio.py /content/egocentric/audio-visual/data/v2/clips\n", + "!python3 vad.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rVr2OzKIrRBY" + }, + "source": [ + "##Active Speaker Detection (ASD)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2BN1eTm_rePW" + }, + "source": [ + "####Mouth Region Classification (MRC)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ipOe-4y1883Q" + }, + "outputs": [], + "source": [ + "# Active Speaker Detection (ASD)\n", + "# Mouth region classification (MRC)\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n", + "\n", + "# Modify line 15 and 17 in the CMake file to include the opencv4 directory\n", + "# https://stackoverflow.com/questions/58478074/how-to-fix-fatal-error-opencv2-core-hpp-no-such-file-or-directory-for-opencv\n", + "!sed -i '15s,.*,link_directories( /usr/local/lib /usr/local/cuda/lib64 /usr/include/opencv4/ ),' CMakeLists.txt\n", + "!sed -i '17s,.*,include_directories( /usr/local/include /usr/local/cuda/include /usr/local/cuda/targets/x86_64-linux/include /usr/include/opencv4/ ),' CMakeLists.txt\n", + "\n", + "# Modify line 21 in the CMake file to fix a compilation error\n", + "# https://github.com/pytorch/pytorch/issues/103371\n", + "!sed -i '21s,.*,set_property(TARGET mrc PROPERTY CXX_STANDARD 17),' CMakeLists.txt\n", + "# Build MRC tracking code\n", + "!mkdir build\n", + "%cd build\n", + "!cmake -DCMAKE_PREFIX_PATH=/content/egocentric/libtorch 
..\n", + "!make" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FcFzpREn8_1P" + }, + "outputs": [], + "source": [ + "#Running the MRC\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction\n", + "!python3 run_once.py /content/egocentric/audio-visual/data/v2/clips ego4d 389" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PY4Mha_QrjV-" + }, + "source": [ + "##Audio Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_OzROruu9CVO" + }, + "outputs": [], + "source": [ + "#Voice Embedding\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/audio_embedding/make_audio_embeddings\n", + "!python3 batch_audio_embedding.py /content/egocentric/audio-visual/data/v2/clips val" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XXUYRiQ1rueJ" + }, + "source": [ + "##Device wearer voice activity detection" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kssnc4VNr4ra" + }, + "source": [ + "####Energy Based Method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LMZ3jbsM9E5c" + }, + "outputs": [], + "source": [ + "#Wearer: energy based method\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/wearer/energy_based\n", + "!python3 short_time_energy.py /content/egocentric/audio-visual/data/v2/clips val\n", + "!python3 match_wearer_audio.py val" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pQnNzUoasJjx" + }, + "source": [ + "##Surrounding people voice matching (MRC)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wOTCIPGi9Mpn" + }, + "outputs": [], + "source": [ + "#Surrounding People Audio Matching: MRC\n", + "%cd /content/egocentric/audio-visual/active-speaker-detection/surrounding_people_audio_matching/mrc\n", + "!python3 match_audio.py /content/egocentric/audio-visual/active-speaker-detection/active_speaker/mrc_active_speaker_detection/prediction/results val" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7rVXK-W5b_Nu" + }, + "source": [ + "#Transcription\n", + "- Need to move \"av_test_unannotated.json\",\"av_train.json\", and \"av_val.json\" from /content/egocentric/audio-visual/utils/ground_truth to /content/egocentric/audio-visual/data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gWtnMbJrbybQ" + }, + "outputs": [], + "source": [ + "#Install Miniconda\n", + "%cd /content/\n", + "#https://www.kaggle.com/code/alaajah/creating-virtual-environment-on-google-colab\n", + "!wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n", + "!chmod +x Miniconda3-latest-Linux-x86_64.sh\n", + "!./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local\n", + "!conda install -q -y --prefix /usr/local python=3.8.10 ujson" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IvLBi0e_cEq8" + }, + "outputs": [], + "source": [ + "#Install Sclite\n", + "%cd /content/\n", + "!git clone https://github.com/usnistgov/SCTK.git\n", + "%cd SCTK\n", + "! make config\n", + "! make all\n", + "! make check\n", + "! make install\n", + "! 
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "To36j1F_cLIC"
+      },
+      "outputs": [],
+      "source": [
+        "# Create the transcription conda environment\n",
+        "%cd /content/egocentric/audio-visual/transcription\n",
+        "!conda create --name transcription_env --file requirements_38_10.txt\n",
+        "!pip install soundfile\n",
+        "!pip install torch\n",
+        "!pip install espnet_model_zoo"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "d7do0eXVcQRr"
+      },
+      "outputs": [],
+      "source": [
+        "# Extract 16kHz single channel audio files in wav format from the videos\n",
+        "!mkdir -p /content/egocentric/audio-visual/data/v2/wavs_16000\n",
+        "%cd /content/egocentric/audio-visual/transcription\n",
+        "!chmod +x extract_wav.sh\n",
+        "!./extract_wav.sh /content/egocentric/audio-visual/data/v2/clips /content/egocentric/audio-visual/data/v2/wavs_16000"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "okmVTCt4cadk"
+      },
+      "outputs": [],
+      "source": [
+        "# Extract transcriptions from the annotation files, decode the audio, and score the decoding output\n",
+        "%cd /content/egocentric/audio-visual/transcription\n",
+        "!pip install torchaudio\n",
+        "!chmod +x score_asr.sh\n",
+        "!./score_asr.sh /content/egocentric/audio-visual/transcription/output 1"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/extensions/labgraph_diarization/README.md b/extensions/labgraph_diarization/README.md
new file mode 100644
index 00000000..58c7d7f8
--- /dev/null
+++ b/extensions/labgraph_diarization/README.md
@@ -0,0 +1,6 @@
+# EGO4D Audio-Visual Diarization Benchmark Colab
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UBGYBNrjJckCb0KoyQ6xXea79kuoLgu6#scrollTo=lhUO_Ov678bm)
+
+This Colab notebook downloads a subset of video clips from the EGO4D dataset and performs Audio-Visual Diarization and Transcription with the [EGO4D Audio Visual Diarization Benchmark](https://github.com/EGO4D/audio-visual/blob/main/diarization/audio-visual/README.md). To access the dataset you will need to review the license agreement and accept the terms for [EGO4D](https://ego4d-data.org/docs/start-here/).
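+
+The notebook's `aws configure` step expects the AWS access key and secret that are issued once the EGO4D license agreement is approved. A sketch of the prompts (all values are placeholders; region and output format can be left blank):
+
+```
+$ aws configure
+AWS Access Key ID [None]: <access key issued with your EGO4D license>
+AWS Secret Access Key [None]: <secret key issued with your EGO4D license>
+Default region name [None]:
+Default output format [None]:
+```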