diff --git a/diag.bib b/diag.bib
index 3559b11..16b1722 100644
--- a/diag.bib
+++ b/diag.bib
@@ -1369,21 +1369,21 @@ @article{Alex21
   url = {http://dx.doi.org/10.1093/cid/ciaa1855},
   volume = {72},
   abstract = {Abstract
-
-                                           Background
-                                           Clinical imaging in suspected invasive fungal disease (IFD) has a significant role in early detection of disease and helps direct further testing and treatment. Revised definitions of IFD from the EORTC/MSGERC were recently published and provide clarity on the role of imaging for the definition of IFD. Here, we provide evidence to support these revised diagnostic guidelines.
-
-
-                                           Methods
-                                           We reviewed data on imaging modalities and techniques used to characterize IFDs.
-
-
-                                           Results
-                                           Volumetric high-resolution computed tomography (CT) is the method of choice for lung imaging. Although no CT radiologic pattern is pathognomonic of IFD, the halo sign, in the appropriate clinical setting, is highly suggestive of invasive pulmonary aspergillosis (IPA) and associated with specific stages of the disease. The ACS is not specific for IFD and occurs in the later stages of infection. By contrast, the reversed halo sign and the hypodense sign are typical of pulmonary mucormycosis but occur less frequently. In noncancer populations, both invasive pulmonary aspergillosis and mucormycosis are associated with "atypical" nonnodular presentations, including consolidation and ground-glass opacities.
-
-
-                                           Conclusions
-                                           A uniform definition of IFD could improve the quality of clinical studies and aid in differentiating IFD from other pathology in clinical practice. Radiologic assessment of the lung is an important component of the diagnostic work-up and management of IFD. Periodic review of imaging studies that characterize findings in patients with IFD will inform future diagnostic guidelines.},
+  
+                                             Background
+                                             Clinical imaging in suspected invasive fungal disease (IFD) has a significant role in early detection of disease and helps direct further testing and treatment. Revised definitions of IFD from the EORTC/MSGERC were recently published and provide clarity on the role of imaging for the definition of IFD. Here, we provide evidence to support these revised diagnostic guidelines.
+  
+  
+                                             Methods
+                                             We reviewed data on imaging modalities and techniques used to characterize IFDs.
+  
+  
+                                             Results
+                                             Volumetric high-resolution computed tomography (CT) is the method of choice for lung imaging. Although no CT radiologic pattern is pathognomonic of IFD, the halo sign, in the appropriate clinical setting, is highly suggestive of invasive pulmonary aspergillosis (IPA) and associated with specific stages of the disease. The ACS is not specific for IFD and occurs in the later stages of infection. By contrast, the reversed halo sign and the hypodense sign are typical of pulmonary mucormycosis but occur less frequently. In noncancer populations, both invasive pulmonary aspergillosis and mucormycosis are associated with "atypical" nonnodular presentations, including consolidation and ground-glass opacities.
+  
+  
+                                             Conclusions
+                                             A uniform definition of IFD could improve the quality of clinical studies and aid in differentiating IFD from other pathology in clinical practice. Radiologic assessment of the lung is an important component of the diagnostic work-up and management of IFD. Periodic review of imaging studies that characterize findings in patients with IFD will inform future diagnostic guidelines.},
   all_ss_ids = {[fbdd91432a4cd3077d88a4ce20a1e7c1bdcbab05]},
   automatic = {yes},
   citation-count = {32},
@@ -1421,10 +1421,10 @@ @article{Alve23
   url = {http://dx.doi.org/10.1148/radiol.230275},
   volume = {308},
   abstract = {Background:A priori identification of patients at risk of artificial intelligence (AI) failure in diagnosing cancer would contribute to the safer clinical integration of diagnostic algorithms.
-                          	Purpose:To evaluate AI prediction variability as an uncertainty quantification (UQ) metric for identifying cases at risk of AI failure in diagnosing cancer at MRI and CT across different cancer types, data sets, and algorithms.
-                          	Materials and Methods:Multicenter data sets and publicly available AI algorithms from three previous studies that evaluated detec-tion of pancreatic cancer on contrast-enhanced CT images, detection of prostate cancer on MRI scans, and prediction of pulmo-nary nodule malignancy on low-dose CT images were analyzed retrospectively. Each task's algorithm was extended to generate an uncertainty score based on ensemble prediction variability. AI accuracy percentage and partial area under the receiver operating characteristic curve (pAUC) were compared between certain and uncertain patient groups in a range of percentile thresholds (10%-90%) for the uncertainty score using permutation tests for statistical significance. The pulmonary nodule malignancy prediction algorithm was compared with 11 clinical readers for the certain group (CG) and uncertain group (UG).
-                          	Results:In total, 18 022 images were used for training and 838 images were used for testing. AI diagnostic accuracy was higher for the cases in the CG across all tasks (P < .001). At an 80% threshold of certain predictions, accuracy in the CG was 21%-29% higher than in the UG and 4%-6% higher than in the overall test data sets. The lesion-level pAUC in the CG was 0.25-0.39 higher than in the UG and 0.05-0.08 higher than in the overall test data sets (P < .001). For pulmonary nodule malignancy prediction, accuracy of AI was on par with clinicians for cases in the CG (AI results vs clinician results, 80% [95% CI: 76, 85] vs 78% [95% CI: 70, 87]; P = .07) but worse for cases in the UG (AI results vs clinician results, 50% [95% CI: 37, 64] vs 68% [95% CI: 60, 76]; P < .001).
-                          	Conclusion:An AI-prediction UQ metric consistently identified reduced performance of AI in cancer diagnosis.},
+                            	Purpose:To evaluate AI prediction variability as an uncertainty quantification (UQ) metric for identifying cases at risk of AI failure in diagnosing cancer at MRI and CT across different cancer types, data sets, and algorithms.
+                            	Materials and Methods:Multicenter data sets and publicly available AI algorithms from three previous studies that evaluated detection of pancreatic cancer on contrast-enhanced CT images, detection of prostate cancer on MRI scans, and prediction of pulmonary nodule malignancy on low-dose CT images were analyzed retrospectively. Each task's algorithm was extended to generate an uncertainty score based on ensemble prediction variability. AI accuracy percentage and partial area under the receiver operating characteristic curve (pAUC) were compared between certain and uncertain patient groups in a range of percentile thresholds (10%-90%) for the uncertainty score using permutation tests for statistical significance. The pulmonary nodule malignancy prediction algorithm was compared with 11 clinical readers for the certain group (CG) and uncertain group (UG).
+                            	Results:In total, 18 022 images were used for training and 838 images were used for testing. AI diagnostic accuracy was higher for the cases in the CG across all tasks (P < .001). At an 80% threshold of certain predictions, accuracy in the CG was 21%-29% higher than in the UG and 4%-6% higher than in the overall test data sets. The lesion-level pAUC in the CG was 0.25-0.39 higher than in the UG and 0.05-0.08 higher than in the overall test data sets (P < .001). For pulmonary nodule malignancy prediction, accuracy of AI was on par with clinicians for cases in the CG (AI results vs clinician results, 80% [95% CI: 76, 85] vs 78% [95% CI: 70, 87]; P = .07) but worse for cases in the UG (AI results vs clinician results, 50% [95% CI: 37, 64] vs 68% [95% CI: 60, 76]; P < .001).
+                            	Conclusion:An AI-prediction UQ metric consistently identified reduced performance of AI in cancer diagnosis.},
   all_ss_ids = {[740d5d34fcd714870ddf0073fd8956db023319f0]},
   automatic = {yes},
   citation-count = {1},
@@ -1441,7 +1441,7 @@ @conference{Alves21a
   booktitle = RSNA,
   title = {CT-based Deep Learning Towards Early Detection Of Pancreatic Ductal Adenocarcinoma},
   abstract = {Purpose: To investigate the performance of a 3D nnUnet based algorithm for pancreatic ductal adenocarcinoma (PDAC)detection and assess the potential of the model for early diagnosis by conducting a subgroup analysis on small (size <2cm) tumors. Methods and Materials: Portal-venous phase contrast-enhanced computed tomography (CE-CT) scans from a cohort of119 patients with pathology-proven PDAC and 122 consecutive patients with normal pancreas were included in thisretrospective study. For the PDAC cohort, expert segmentations of the pancreas and tumor volumes were available, alongwith the tumor sizes measured on the CT scan. For the non-PDAC cohort, the pancreas segmentations were obtained usinga pre-trained deep learning segmentation model. The pancreas segmentation determined a region of interest from the fullCE-CT as input to the 3D nnUnet.
-                             The network was trained for 1000 epochs with 5-fold cross-validation to differentiatebetween tumor and normal voxels. The predicted heatmaps were thresholded at 0.1. An image was considered a positivecase of PDAC if the predicted tumor volume was greater than 100 mm3. Results: The median tumor size on the PDAC cohort was 2.8 cm (range 1.2 cm - 9.3 cm). The detection task achieved anaverage sensitivity of 0.93 +- 0.04 (111/119), specificity of 0.98 +- 0.02 (119/122) and area under the receiver operatingcharacteristic curve of 0.96 +- 0.04. The median DICE score between the expert and the network tumor segmentations was0.68 +- 0.18. In 2 of the 3 false positive cases the network wrongly detected a hypodense region of the normal pancreas,which could be originated by fat accumulation or natural perfusion differences. The mean sensitivity in the sub-group oftumors with size smaller than 2 cm was 0.92 +- 0.1 (21/23), and the median DICE score in this sub-group was 0.56 +- 0.20. Conclusions: These preliminary results indicate that a 3D nnUnet based algorithm can accurately detect small tumors,suggesting that it could be useful at assisting in early PDAC diagnosis. Clinical Relevance/Application: Early diagnosis improves pancreatic cancer prognosis but requires significant expertise.An automatic tool for the detection of early-stage tumors would reduce expertise requirements.},
+                               The network was trained for 1000 epochs with 5-fold cross-validation to differentiate between tumor and normal voxels. The predicted heatmaps were thresholded at 0.1. An image was considered a positive case of PDAC if the predicted tumor volume was greater than 100 mm3. Results: The median tumor size on the PDAC cohort was 2.8 cm (range 1.2 cm - 9.3 cm). The detection task achieved an average sensitivity of 0.93 +- 0.04 (111/119), specificity of 0.98 +- 0.02 (119/122) and area under the receiver operating characteristic curve of 0.96 +- 0.04. The median DICE score between the expert and the network tumor segmentations was 0.68 +- 0.18. In 2 of the 3 false positive cases the network wrongly detected a hypodense region of the normal pancreas, which could be originated by fat accumulation or natural perfusion differences. The mean sensitivity in the sub-group of tumors with size smaller than 2 cm was 0.92 +- 0.1 (21/23), and the median DICE score in this sub-group was 0.56 +- 0.20. Conclusions: These preliminary results indicate that a 3D nnUnet based algorithm can accurately detect small tumors, suggesting that it could be useful at assisting in early PDAC diagnosis. Clinical Relevance/Application: Early diagnosis improves pancreatic cancer prognosis but requires significant expertise. An automatic tool for the detection of early-stage tumors would reduce expertise requirements.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -1562,10 +1562,10 @@ @conference{Anto23
   booktitle = ECR,
   title = {Retrospective validation of nodule management based on deep learning-based malignancy thresholds in lung cancer screening},
   abstract = {Purpose: We previously developed and validated a deep learning (DL) algorithm for malignancy risk estimation of screen-detected nodules. The nodule risk cut-off for a positive screen, triggering more intensive follow-up (either short-term follow-up, PET-CT or biopsy), varies in existing nodule management protocols; 1-2% for Lung-RADS (cat 3), 6% for PanCan2b (CAT3). In this study, we investigated two DL-based malignancy thresholds to define a positive screen, compared to existing nodule management protocols.
-                          Methods and materials: All baseline CT-scans from the Danish Lung Cancer Screening Trial were linked to lung cancer diagnosis within 2 years, resulting in 2,019 non-cancer and 18 cancer cases. The DL-based malignancy risk was computed for all screen-detected nodules using two malignancy risk cut-off points (6% and 10%), as threshold for a positive screen. For both Lung-RADS and PanCan2b, we used the published nodule-risk cut-offs for a positive screen. Sensitivity and False Positive Rate (FPR) were calculated for all baseline scans (n=2,037) using the risk dominant nodule per scan.
-                          Results: At a threshold of 6%, DL achieved the highest sensitivity with 88.9% compared to 83.3% of Lung-RADS and 77.8% with PanCan2b. DL and PanCan2b yielded comparable FPR of 3.6% and 4.1%, respectively, while Lung-RADS had a higher FPR of 8.7%. Increasing the DL threshold to >=10% resulted in a sensitivity of 88.9%, and a FPR of 2.5%.
-                          Conclusion: DL-based nodule risk cut-offs achieved the highest sensitivity and lowest FPR for defining a positive screen, triggering more intense diagnostic work-up. Increasing the risk cut-off from 6% to 10% further decreased the FPR without alteration of sensitivity.
-                          Limitations: This study is a retrospective analysis on data from one screening trial and one screening round. More external validation is needed, including validation for incidence screenings.},
+                            Methods and materials: All baseline CT-scans from the Danish Lung Cancer Screening Trial were linked to lung cancer diagnosis within 2 years, resulting in 2,019 non-cancer and 18 cancer cases. The DL-based malignancy risk was computed for all screen-detected nodules using two malignancy risk cut-off points (6% and 10%), as threshold for a positive screen. For both Lung-RADS and PanCan2b, we used the published nodule-risk cut-offs for a positive screen. Sensitivity and False Positive Rate (FPR) were calculated for all baseline scans (n=2,037) using the risk dominant nodule per scan.
+                            Results: At a threshold of 6%, DL achieved the highest sensitivity with 88.9% compared to 83.3% of Lung-RADS and 77.8% with PanCan2b. DL and PanCan2b yielded comparable FPR of 3.6% and 4.1%, respectively, while Lung-RADS had a higher FPR of 8.7%. Increasing the DL threshold to >=10% resulted in a sensitivity of 88.9%, and a FPR of 2.5%.
+                            Conclusion: DL-based nodule risk cut-offs achieved the highest sensitivity and lowest FPR for defining a positive screen, triggering more intense diagnostic work-up. Increasing the risk cut-off from 6% to 10% further decreased the FPR without alteration of sensitivity.
+                            Limitations: This study is a retrospective analysis on data from one screening trial and one screening round. More external validation is needed, including validation for incidence screenings.},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
 }
@@ -1575,10 +1575,10 @@ @conference{Anto23a
   booktitle = ESTI,
   title = {Retrospective identification of low-risk individuals eligible for biennial lung cancer screening using PanCan-based and deep learning-based risk thresholds},
   abstract = {Purpose: Current nodule management protocols for managing negative screening results have varying follow-up intervals; LungRADS recommends a 1-year screening interval for all negative screens (category 1/2), while the International Lung Screen Trial (ILST) protocol recommends 1-year interval for participants with indeterminate nodules (PanCan score 1.5% - 6%) and 2-year interval for participants with no or very low risk nodules (PanCan score < 1.5%). In this study, we retrospectively evaluated the use of PanCan and DL-based malignancy thresholds to identify individuals eligible for biennial screening, aiming to reduce screening-related harms and enhancing cost-effectiveness without causing potential delay of cancer diagnosis.
-                        Methods and materials: All baseline CT-scans from the Danish Lung Cancer Screening Trial (DLCST) and Multicentric Italian Lung Detection (MILD) were pooled and linked to a lung cancer diagnosis within 2 years, resulting in 4.157 non-cancer and 53 cancer cases. PanCan1a and DL-based malignancy risk scores were calculated for all screen-annotated nodules. For cases with no screen-annotated nodules, the risk score for participants was set as 0%. For both risk calculators, we used a nodule-risk cut-off of < 1.5% to identify low-risk participants for biennial follow-up, based on the ILST protocol. We computed the number of low-risk participants eligible for biennial screening for all included baseline scans (n=4.210) using the risk dominant nodule per scan and calculated the number of cancer cases in the biennial group.
-                        Results: The DL-based and PanCan-based risk threshold < 1.5% identified 3.729 and 3.720 individuals, respectively, meeting the criteria for biennial screening. This would result in a reduction of 88.6% and 88.4% of the scans in the second screening round, respectively. The group referred for biennial screening included 14 and 16 cancers with DL and PanCan-based risk scores <1.5%, respectively. Most of the cancer cases (n=13), had no nodule annotated at baseline CT, leading to a 0% risk score at baseline. Retrospectively 4 of the 13 cancers were visible in the baseline scan, yet primarily not annotated by the screening radiologist.
-                        Conclusion: Risk threshold-based identification of low-risk subjects for biennial screening largely reduces the number of 1-year follow-up scans. DL and PanCan for risk assessment performed very similarly, indicating the potential of DL for readily available risk assessment of baseline scans. A risk threshold of < 1.5%, as implemented in the ILST protocol, leads to delayed diagnosis of cancers either primarily missed during baseline or developing as interval cancers. More research is needed to study the type of cancers with delayed diagnosis and whether such delay leads to diagnostic stage shift.
-                        Limitations: This study is a retrospective analysis on data from two screening trials, restricted to the baseline round.},
+                          Methods and materials: All baseline CT-scans from the Danish Lung Cancer Screening Trial (DLCST) and Multicentric Italian Lung Detection (MILD) were pooled and linked to a lung cancer diagnosis within 2 years, resulting in 4.157 non-cancer and 53 cancer cases. PanCan1a and DL-based malignancy risk scores were calculated for all screen-annotated nodules. For cases with no screen-annotated nodules, the risk score for participants was set as 0%. For both risk calculators, we used a nodule-risk cut-off of < 1.5% to identify low-risk participants for biennial follow-up, based on the ILST protocol. We computed the number of low-risk participants eligible for biennial screening for all included baseline scans (n=4.210) using the risk dominant nodule per scan and calculated the number of cancer cases in the biennial group.
+                          Results: The DL-based and PanCan-based risk threshold < 1.5% identified 3.729 and 3.720 individuals, respectively, meeting the criteria for biennial screening. This would result in a reduction of 88.6% and 88.4% of the scans in the second screening round, respectively. The group referred for biennial screening included 14 and 16 cancers with DL and PanCan-based risk scores <1.5%, respectively. Most of the cancer cases (n=13), had no nodule annotated at baseline CT, leading to a 0% risk score at baseline. Retrospectively 4 of the 13 cancers were visible in the baseline scan, yet primarily not annotated by the screening radiologist.
+                          Conclusion: Risk threshold-based identification of low-risk subjects for biennial screening largely reduces the number of 1-year follow-up scans. DL and PanCan for risk assessment performed very similarly, indicating the potential of DL for readily available risk assessment of baseline scans. A risk threshold of < 1.5%, as implemented in the ILST protocol, leads to delayed diagnosis of cancers either primarily missed during baseline or developing as interval cancers. More research is needed to study the type of cancers with delayed diagnosis and whether such delay leads to diagnostic stage shift.
+                          Limitations: This study is a retrospective analysis on data from two screening trials, restricted to the baseline round.},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
 }
@@ -1588,16 +1588,16 @@ @conference{Anto24a
   booktitle = ESTI,
   title = {Retrospective validation and comparison of deep learning based risk thresholds versus growth-centric protocols in pulmonary nodule assessment in screening},
   abstract = {Purpose/Objectives:
-       We previously developed a deep learning (DL) algorithm for estimating malignancy risk in screen-detected nodules using a current and prior low-dose CT scan to assess 3-year malignancy risk of persisting pulmonary nodules. Existing nodule management guidelines have diverse criteria for intensified follow-up actions (short-term follow-up, PET-CT, or biopsy) based on nodule growth. Key criteria include more than 1.5 mm diameter growth within 12 months for Lung-RADS or subsequent scans in the International Lung Screen Trial (ILST) protocol, and 25% volume growth for nodules over 100 mm3 in the updated NELSON protocol, among other more detailed criteria. This study evaluates a deep learning-based method for determining the malignancy risk of persisting nodules in comparison to growth-centric protocols.
-
-       Methods and materials:
-       For this study we used 679 pairs of annual low-dose CT scans from the Danish Lung Cancer Screening Trial. This data set was constructed by selecting scans preceding lung cancer diagnosis for malignant cases and equivalent periods for benign cases in individuals without lung cancer, including 1,116 screen-annotated nodules across 639 non-cancer and 40 cancer cases. The DL-based malignancy risk was computed for all screen-detected nodules using a malignancy risk cut-off point of 5% as threshold for a positive screen. This 5% threshold aligns with the recommendations of the American College of Chest Physicians (ACCP), considering that nodules with less than 5% risk are deemed to have a very low risk of malignancy. We applied the published growth criteria from Lung-RADS, ILST and updated NELSON protocols to define a positive screening outcome. Sensitivity and specificity were calculated for all cases (n=679) using the risk dominant nodule per scan.
-
-       Results:
-       The deep learning (DL) model, with a 5% threshold, achieved a sensitivity of 90%, surpassing Lung-RADS at 77.5% and the NELSON protocol at 82.5%, while closely aligning with ILST at 92.5%. In specificity, the model excelled with 95.5%, outperforming Lung-RADS at 88.3%, ILST at 80.9%, and the NELSON protocol at 88%.
-
-       Conclusion :
-       In this study, a deep learning (DL) algorithm for lung cancer screening, which analyses both current and prior low-dose CT scans, showed notable performance by achieving a sensitivity of 90%, exceeding Lung-RADS, NELSON, and closely rivalling ILST. More notably, its specificity of 95.9% surpassed all compared protocols. These findings suggest that the deep learning algorithm considers other factors beyond nodule growth for malignancy risk estimation, highlighting the algorithm's efficiency not only in detecting lung cancer but also in potentially reducing false positives. In conclusion, the integration of prior CT scans coupled with the use of deep learning potentially provides a more precise assessment than traditional growth rate-based protocols.},
+         We previously developed a deep learning (DL) algorithm for estimating malignancy risk in screen-detected nodules using a current and prior low-dose CT scan to assess 3-year malignancy risk of persisting pulmonary nodules. Existing nodule management guidelines have diverse criteria for intensified follow-up actions (short-term follow-up, PET-CT, or biopsy) based on nodule growth. Key criteria include more than 1.5 mm diameter growth within 12 months for Lung-RADS or subsequent scans in the International Lung Screen Trial (ILST) protocol, and 25% volume growth for nodules over 100 mm3 in the updated NELSON protocol, among other more detailed criteria. This study evaluates a deep learning-based method for determining the malignancy risk of persisting nodules in comparison to growth-centric protocols.
+  
+         Methods and materials:
+         For this study we used 679 pairs of annual low-dose CT scans from the Danish Lung Cancer Screening Trial. This data set was constructed by selecting scans preceding lung cancer diagnosis for malignant cases and equivalent periods for benign cases in individuals without lung cancer, including 1,116 screen-annotated nodules across 639 non-cancer and 40 cancer cases. The DL-based malignancy risk was computed for all screen-detected nodules using a malignancy risk cut-off point of 5% as threshold for a positive screen. This 5% threshold aligns with the recommendations of the American College of Chest Physicians (ACCP), considering that nodules with less than 5% risk are deemed to have a very low risk of malignancy. We applied the published growth criteria from Lung-RADS, ILST and updated NELSON protocols to define a positive screening outcome. Sensitivity and specificity were calculated for all cases (n=679) using the risk dominant nodule per scan.
+  
+         Results:
+         The deep learning (DL) model, with a 5% threshold, achieved a sensitivity of 90%, surpassing Lung-RADS at 77.5% and the NELSON protocol at 82.5%, while closely aligning with ILST at 92.5%. In specificity, the model excelled with 95.5%, outperforming Lung-RADS at 88.3%, ILST at 80.9%, and the NELSON protocol at 88%.
+  
+         Conclusion :
+         In this study, a deep learning (DL) algorithm for lung cancer screening, which analyses both current and prior low-dose CT scans, showed notable performance by achieving a sensitivity of 90%, exceeding Lung-RADS, NELSON, and closely rivalling ILST. More notably, its specificity of 95.9% surpassed all compared protocols. These findings suggest that the deep learning algorithm considers other factors beyond nodule growth for malignancy risk estimation, highlighting the algorithm's efficiency not only in detecting lung cancer but also in potentially reducing false positives. In conclusion, the integration of prior CT scans coupled with the use of deep learning potentially provides a more precise assessment than traditional growth rate-based protocols.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -1626,8 +1626,8 @@ @mastersthesis{Arch22
   author = {Anwai Archit and Bram van Ginneken},
   title = {Automated Abdominal Aortic Aneurysm Detection on CT Scans},
   abstract = {Computed tomography (CT) scans enable the detection of local enlargements in the abdominal aorta (AA), resulting to straight-forward quantitative and qualitative understandings, typically instated as abdominal aortic aneurysm (AAA). Although, the segmentation of aorta is disposed to stall in presence of expanded lumen or intraluminal thrombus as a result of insufficient spiteful examples, raising the susceptibility for uneventful outcomes of an aortic rupture.
-                              The motion of this research proposes to develop and validate a fully automated deep learning algorithm to segment and measure AAAs on abdominal CT scans. The computer-aided detection (CAD) model is steered by a self-configuring convolutional neural network (CNN), which plumps for essential decisions in a standardised environment to design the 3D segmentation pipeline, regardless of the dataset diversity in the domain. It uses an additional 3D instance-based vertebral segmentation software bundle for independent vertebrae labelling. It coheres with a post-processing routine to perceive the growth patterns by investigation across the aortic centerline around strong anatomical landmarks. It benefits from supplementary measurement techniques of the maximal diameter and cross-section area for gaining extensive insights of the main characteristics of AAA. The system evaluates the relationship between the AA and vertebra level surface features. Conclusively, it generates a portable document, devised to group the anticipated aneurysmal information.
-                            The 3D CAD system agrees with expert's suggestions about the existence of the aneurysm in 398 institutional images, exhibiting a high capacity to generalize across genders and portions of a full body CT scan using solely radiologist-supported quantitative speculations from the radiology reports. The end-to-end routine achieves an 95.7% dice score coefficient (DSC) on the validation subset for patient-specific cases, indicating a modest agreement with radiologists within an average difference of 0.3 cm in the relative measurement of maximal AAA diameter, thus justifying the possibility of generalizing to the detection of aneurysms using report-based textual information only.},
+                                The motion of this research proposes to develop and validate a fully automated deep learning algorithm to segment and measure AAAs on abdominal CT scans. The computer-aided detection (CAD) model is steered by a self-configuring convolutional neural network (CNN), which plumps for essential decisions in a standardised environment to design the 3D segmentation pipeline, regardless of the dataset diversity in the domain. It uses an additional 3D instance-based vertebral segmentation software bundle for independent vertebrae labelling. It coheres with a post-processing routine to perceive the growth patterns by investigation across the aortic centerline around strong anatomical landmarks. It benefits from supplementary measurement techniques of the maximal diameter and cross-section area for gaining extensive insights of the main characteristics of AAA. The system evaluates the relationship between the AA and vertebra level surface features. Conclusively, it generates a portable document, devised to group the anticipated aneurysmal information.
+                              The 3D CAD system agrees with expert's suggestions about the existence of the aneurysm in 398 institutional images, exhibiting a high capacity to generalize across genders and portions of a full body CT scan using solely radiologist-supported quantitative speculations from the radiology reports. The end-to-end routine achieves an 95.7% dice score coefficient (DSC) on the validation subset for patient-specific cases, indicating a modest agreement with radiologists within an average difference of 0.3 cm in the relative measurement of maximal AAA diameter, thus justifying the possibility of generalizing to the detection of aneurysms using report-based textual information only.},
   file = {Arch22.pdf:pdf\\Arch22.pdf:PDF},
   journal = {Master thesis},
   optnote = {DIAG},
@@ -1641,17 +1641,17 @@ @conference{Ardu20
   title = {Artificial Intelligence for the Classification and Quantification of Reticular Pseudodrusen in Multimodal Retinal Images},
   url = {https://www.euretina.org/congress/amsterdam-2020/virtual-2020-freepapers/},
   abstract = {Purpose:
-                                Reticular pseudodrusen (RPD) are retinal lesions highly correlated with the risk of developing end-stage age-related macular degeneration (AMD) and, therefore, relevant biomarkers for understanding the progression of AMD. Due to the subtle features characterizing RPD, multiple imaging modalities are often necessary to confirm the presence and extension of RPD, considerably increasing the workload of the expert graders. We propose a deep neural network (DNN) architecture that classifies and quantifies RPD using multimodal retinal images.
-                                Setting:
-                                A cross-sectional study that compares the performance of three expert graders with a DNN trained for identifying and quantifying RPD. Conducted on retinal images drawn from the Rotterdam Study, a population-based cohort, in three modalities: color fundus photographs (CFP), fundus autofluorescence images (FAF) and near-infrared reflectance images (NIR).
-                                Methods:
-                                Multimodal images of 278 eyes of 230 patients were retrieved from the Rotterdam Study database. Of those, 72 eyes showed presence of RPD, 108 had soft distinct/indistinct drusen, and 98 had no signs of drusen as confirmed by the Rotterdam Study graders. Delineations of the areas affected with RPD were made in consensus by two human experts using CFP and NIR images simultaneously and were used as reference standard (RS) for RPD area quantification. The data was randomly divided, patient-wise, in training (243) and test (35) sets for model development and evaluation. A DNN was developed for RPD classification and quantification. The proposed DNN is based on an encoder-decoder architecture. The model jointly inputs a set of co-registered retinal image modalities (CFP, NIR, FAF) and outputs a heatmap image containing, per pixel, the likelihood of RPD presence. The 99th percentile of the values contained in this heatmap measures the likelihood of RPD presence. Three independent graders manually delineated RPD in all eyes of the test set based on the CFP and NIR and their performance was compared with the DNN in the tasks of RPD classification and quantification.
-                                Results:
-                                The proposed DNN obtained an area under the receiver operating characteristic curve (AUROC) with 95% confidence interval (CI) of 0.939[0.818-1.0], a sensitivity (SE) of 0.928 and specificity (SP) of 0.809 for the detection of RPD in multimodal imaging. For RPD quantification, the DNN achieved a mean Dice coefficient (DSC) of 0.632+-0.261 and an intra-class correlation (ICC) of 0.676[0.294-0.999]. Comparably, for RPD classification, grader 1 obtained SE/SP pairs of 1.0/0.785, grader 2 of 1.0/0.5 and grader 3 of 1.0/0.785. For RPD quantification, the graders obtained mean DSC of 0.619+-0.196, 0.573+-0.170 and 0.697+-0.157, respectively, and an ICC of 0.721[0.340-0.999], 0.597[0.288-0.999], 0.751[0.294-0.999], respectively. Of the DNN's three false negatives, none of them was correctly classified by the three graders. The model correctly classified RPD in three of the six eyes where graders disagreed and in the only eye where none of the graders found RPD. Overall, 65.1% of the area indicated as RPD by the reference was delineated by at least one grader and only 26.5% of the total was graded as RPD by all experts. The DNN only missed 23.2% of the areas that all three graders identified correctly.
-                                Conclusions:
-                                The proposed DNN showed promising capacities in the tasks of classifying and quantifying RPD lesions on multimodal retinal images. The results show that the model is able to correctly classify and quantify RPD on eyes where lesions are difficult to spot. The probabilistic output of the model allows for the classification of RPD at different levels of confidence and indicates what retinal areas are most likely affected. This is in line with the manual assessment done by the graders. To this point, the model is developed to classify and quantify RPD only on CFP, FAF and NIR. However, introducing other imaging modalities, such as OCT, might help diminish ambiguities in the classification and quantification of this abnormality. Therefore, a future direction for improving the proposed method is to include OCT scans as an additional input to the model. Automatic classification and quantification of RPD using deep learning on multimodal images will enable the automatic and accurate analysis of increasingly large amounts of data for clinical studies and will facilitate AMD screening in the elderly  by decreasing the workload of the expert graders.
-                                Financial Disclosure:
-                                None},
+                                  Reticular pseudodrusen (RPD) are retinal lesions highly correlated with the risk of developing end-stage age-related macular degeneration (AMD) and, therefore, relevant biomarkers for understanding the progression of AMD. Due to the subtle features characterizing RPD, multiple imaging modalities are often necessary to confirm the presence and extension of RPD, considerably increasing the workload of the expert graders. We propose a deep neural network (DNN) architecture that classifies and quantifies RPD using multimodal retinal images.
+                                  Setting:
+                                  A cross-sectional study that compares the performance of three expert graders with a DNN trained for identifying and quantifying RPD. Conducted on retinal images drawn from the Rotterdam Study, a population-based cohort, in three modalities: color fundus photographs (CFP), fundus autofluorescence images (FAF) and near-infrared reflectance images (NIR).
+                                  Methods:
+                                  Multimodal images of 278 eyes of 230 patients were retrieved from the Rotterdam Study database. Of those, 72 eyes showed presence of RPD, 108 had soft distinct/indistinct drusen, and 98 had no signs of drusen as confirmed by the Rotterdam Study graders. Delineations of the areas affected with RPD were made in consensus by two human experts using CFP and NIR images simultaneously and were used as reference standard (RS) for RPD area quantification. The data was randomly divided, patient-wise, in training (243) and test (35) sets for model development and evaluation. A DNN was developed for RPD classification and quantification. The proposed DNN is based on an encoder-decoder architecture. The model jointly inputs a set of co-registered retinal image modalities (CFP, NIR, FAF) and outputs a heatmap image containing, per pixel, the likelihood of RPD presence. The 99th percentile of the values contained in this heatmap measures the likelihood of RPD presence. Three independent graders manually delineated RPD in all eyes of the test set based on the CFP and NIR and their performance was compared with the DNN in the tasks of RPD classification and quantification.
+                                  Results:
+                                  The proposed DNN obtained an area under the receiver operating characteristic curve (AUROC) with 95% confidence interval (CI) of 0.939[0.818-1.0], a sensitivity (SE) of 0.928 and specificity (SP) of 0.809 for the detection of RPD in multimodal imaging. For RPD quantification, the DNN achieved a mean Dice coefficient (DSC) of 0.632+-0.261 and an intra-class correlation (ICC) of 0.676[0.294-0.999]. Comparably, for RPD classification, grader 1 obtained SE/SP pairs of 1.0/0.785, grader 2 of 1.0/0.5 and grader 3 of 1.0/0.785. For RPD quantification, the graders obtained mean DSC of 0.619+-0.196, 0.573+-0.170 and 0.697+-0.157, respectively, and an ICC of 0.721[0.340-0.999], 0.597[0.288-0.999], 0.751[0.294-0.999], respectively. Of the DNN's three false negatives, none of them was correctly classified by the three graders. The model correctly classified RPD in three of the six eyes where graders disagreed and in the only eye where none of the graders found RPD. Overall, 65.1% of the area indicated as RPD by the reference was delineated by at least one grader and only 26.5% of the total was graded as RPD by all experts. The DNN only missed 23.2% of the areas that all three graders identified correctly.
+                                  Conclusions:
+                                  The proposed DNN showed promising capacities in the tasks of classifying and quantifying RPD lesions on multimodal retinal images. The results show that the model is able to correctly classify and quantify RPD on eyes where lesions are difficult to spot. The probabilistic output of the model allows for the classification of RPD at different levels of confidence and indicates what retinal areas are most likely affected. This is in line with the manual assessment done by the graders. To this point, the model is developed to classify and quantify RPD only on CFP, FAF and NIR. However, introducing other imaging modalities, such as OCT, might help diminish ambiguities in the classification and quantification of this abnormality. Therefore, a future direction for improving the proposed method is to include OCT scans as an additional input to the model. Automatic classification and quantification of RPD using deep learning on multimodal images will enable the automatic and accurate analysis of increasingly large amounts of data for clinical studies and will facilitate AMD screening in the elderly  by decreasing the workload of the expert graders.
+                                  Financial Disclosure:
+                                  None},
   month = {9},
   optnote = {DIAG, RADIOLOGY},
   year = {2020},
@@ -1774,9 +1774,9 @@ @article{Arnt16
   year = {2016},
   doi = {10.1212/WNL.0000000000003123},
   abstract = {Objective: To study the long-term prevalence of small vessel disease after young stroke and to compare this to healthy controls.
-                                                        Methods: This prospective cohort study comprises 337 patients with an ischemic stroke or TIA, aged 18-50 years without a history of TIA or stroke. In addition 90 age and sex matched controls were included. At follow-up lacunes, microbleeds and white matter hyperintensitie (WMH) volume were assessed using MRI. To investigate the relation between riks factors and small vessel disease, logistic and linear regression were used.
-                                                        Results: After mean follow-up of 9.9 (SD 8.1) years, 337 patients were included (227 with an ischemic stroke and 110 with a TIA). Mean age for patients was 49.8 (SD 10.3) years and 45.4% were men, for controls mean age was 49.4 (SD 11.9) and 45.6% were men. Compared with controls, patients more often had at least one lacune (24.0% versus 4.5%, p<0.0001). In addition, they had a higher WMH-volume (median 1.5 ml (IQR 0.5-3.7) versus 0.4 ml (IQR 0.0-1.0), p<0.001). Compared with controls, patients had the same volume of WMH on average 10-20 years earlier. In the patient group, age at stroke (beta=0.03 (95%CI 0.02-0.04) hypertension (beta=0.22, 95%CI 0.04-0.39) and smoking (beta=0.18, 95%CI 0.01-0.34) at baseline were associated with WMH-volume.
-                                                        Conclusions: Patients with a young stroke have a higher burden of small vessel disease than controls adjusted for confounders. Cerebral aging seems accelerated by 10-20 years in these patients, which may suggest an increased vulnerability to vascular risk factors.},
+                                                          Methods: This prospective cohort study comprises 337 patients with an ischemic stroke or TIA, aged 18-50 years without a history of TIA or stroke. In addition 90 age and sex matched controls were included. At follow-up lacunes, microbleeds and white matter hyperintensity (WMH) volume were assessed using MRI. To investigate the relation between risk factors and small vessel disease, logistic and linear regression were used.
+                                                          Results: After mean follow-up of 9.9 (SD 8.1) years, 337 patients were included (227 with an ischemic stroke and 110 with a TIA). Mean age for patients was 49.8 (SD 10.3) years and 45.4% were men, for controls mean age was 49.4 (SD 11.9) and 45.6% were men. Compared with controls, patients more often had at least one lacune (24.0% versus 4.5%, p<0.0001). In addition, they had a higher WMH-volume (median 1.5 ml (IQR 0.5-3.7) versus 0.4 ml (IQR 0.0-1.0), p<0.001). Compared with controls, patients had the same volume of WMH on average 10-20 years earlier. In the patient group, age at stroke (beta=0.03, 95%CI 0.02-0.04), hypertension (beta=0.22, 95%CI 0.04-0.39) and smoking (beta=0.18, 95%CI 0.01-0.34) at baseline were associated with WMH-volume.
+                                                          Conclusions: Patients with a young stroke have a higher burden of small vessel disease than controls adjusted for confounders. Cerebral aging seems accelerated by 10-20 years in these patients, which may suggest an increased vulnerability to vascular risk factors.},
   file = {Arnt16.pdf:pdf\\Arnt16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27521431},
@@ -1989,12 +1989,12 @@ @conference{Aswo19
   booktitle = {European Congress of Pathology},
   title = {Potential of an AI-based digital biomarker to predict neoadjuvant chemotherapy response from preoperative biopsies of Luminal-B breast cancer},
   abstract = {Background & objectives: Invasive breast cancer (IBC) is increasingly treated with neoadjuvant chemotherapy. Yet, only 15-20% of Luminal-B patients achieve pathological complete response (pCR). We developed an AI-based biomarker to predict pCR of Luminal-B IBC from preoperative biopsies stained with H&E.
-
-                             Methods: First, we trained a deep learning model on a multi-centric dataset of n=277 manually annotated breast cancer H&E-stained histopathology images to segment tumour, lymphocytes and other tissue. Second, we applied the segmentation model to an independent set of n=297 Luminal-B pre-treatment biopsies. For each case, we computed our biomarker: the proportion of tumour within 80mm distance from lymphocyte regions.
-
-                             Results: From the Luminal-B cohort, 32/297 cases (11%) were labelled as "pCR" when no remaining cancer cells were reported for the post-operative surgical resection. The biomarker showed significant (p<<0.01) correlation with pCR with a point biserial correlation coefficient of 0.27. Setting a cut-off value based on the optimal operating point of the ROC curve (AUC=0.69), we reached a sensitivity of 0.53 and a specificity of 0.74.
-
-                             Conclusion: The developed deep-learning based biomarker quantifies the proportion of inflammatory tumour regions. It shows promising results for predicting pCR for Luminal-B breast cancer from pre-treatment biopsies stained with H&E.},
+  
+                               Methods: First, we trained a deep learning model on a multi-centric dataset of n=277 manually annotated breast cancer H&E-stained histopathology images to segment tumour, lymphocytes and other tissue. Second, we applied the segmentation model to an independent set of n=297 Luminal-B pre-treatment biopsies. For each case, we computed our biomarker: the proportion of tumour within 80mm distance from lymphocyte regions.
+  
+                               Results: From the Luminal-B cohort, 32/297 cases (11%) were labelled as "pCR" when no remaining cancer cells were reported for the post-operative surgical resection. The biomarker showed significant (p<<0.01) correlation with pCR with a point biserial correlation coefficient of 0.27. Setting a cut-off value based on the optimal operating point of the ROC curve (AUC=0.69), we reached a sensitivity of 0.53 and a specificity of 0.74.
+  
+                               Conclusion: The developed deep-learning based biomarker quantifies the proportion of inflammatory tumour regions. It shows promising results for predicting pCR for Luminal-B breast cancer from pre-treatment biopsies stained with H&E.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
 }
@@ -2020,17 +2020,17 @@ @article{Aswo23
   url = {http://dx.doi.org/10.1186/s13058-023-01726-0},
   volume = {25},
   abstract = {Abstract
-                                        Background
-                                        Invasive breast cancer patients are increasingly being treated with neoadjuvant chemotherapy; however, only a fraction of the patients respond to it completely. To prevent overtreatment, there is an urgent need for biomarkers to predict treatment response before administering the therapy.
-
-                                        Methods
-                                        In this retrospective study, we developed hypothesis-driven interpretable biomarkers based on deep learning, to predict the pathological complete response (pCR, i.e., the absence of tumor cells in the surgical resection specimens) to neoadjuvant chemotherapy solely using digital pathology H&amp;E images of pre-treatment breast biopsies. Our approach consists of two steps: First, we use deep learning to characterize aspects of the tumor micro-environment by detecting mitoses and segmenting tissue into several morphology compartments including tumor, lymphocytes and stroma. Second, we derive computational biomarkers from the segmentation and detection output to encode slide-level relationships of components of the tumor microenvironment, such as tumor and mitoses, stroma, and tumor infiltrating lymphocytes(TILs).
-
-                                        Results
-                                        We developed and evaluated our method on slides from n=721 patients from three European medical centers with triple-negative and Luminal B breast cancers and performed external independent validation on n=126 patients from a public dataset. We report the predictive value of the investigated biomarkers for predicting pCR with areas under the receiver operating characteristic curve between 0.66 and 0.88 across the tested cohorts.
-
-                                        Conclusion
-                                        The proposed computational biomarkers predict pCR, but will require more evaluation and finetuning for clinical application. Our results further corroborate the potential role of deep learning to automate TILs quantification, and their predictive value in breast cancer neoadjuvant treatment planning, along with automated mitoses quantification. We made our method publicly available to extract segmentation-based biomarkers for research purposes.},
+                                          Background
+                                          Invasive breast cancer patients are increasingly being treated with neoadjuvant chemotherapy; however, only a fraction of the patients respond to it completely. To prevent overtreatment, there is an urgent need for biomarkers to predict treatment response before administering the therapy.
+  
+                                          Methods
+                                          In this retrospective study, we developed hypothesis-driven interpretable biomarkers based on deep learning, to predict the pathological complete response (pCR, i.e., the absence of tumor cells in the surgical resection specimens) to neoadjuvant chemotherapy solely using digital pathology H&E images of pre-treatment breast biopsies. Our approach consists of two steps: First, we use deep learning to characterize aspects of the tumor micro-environment by detecting mitoses and segmenting tissue into several morphology compartments including tumor, lymphocytes and stroma. Second, we derive computational biomarkers from the segmentation and detection output to encode slide-level relationships of components of the tumor microenvironment, such as tumor and mitoses, stroma, and tumor infiltrating lymphocytes (TILs).
+  
+                                          Results
+                                          We developed and evaluated our method on slides from n=721 patients from three European medical centers with triple-negative and Luminal B breast cancers and performed external independent validation on n=126 patients from a public dataset. We report the predictive value of the investigated biomarkers for predicting pCR with areas under the receiver operating characteristic curve between 0.66 and 0.88 across the tested cohorts.
+  
+                                          Conclusion
+                                          The proposed computational biomarkers predict pCR, but will require more evaluation and finetuning for clinical application. Our results further corroborate the potential role of deep learning to automate TILs quantification, and their predictive value in breast cancer neoadjuvant treatment planning, along with automated mitoses quantification. We made our method publicly available to extract segmentation-based biomarkers for research purposes.},
   automatic = {yes},
   citation-count = {0},
   file = {Aswo23.pdf:pdf\\Aswo23.pdf:PDF},
@@ -2113,9 +2113,9 @@ @article{Ayat20
   pages = {297-307},
   volume = {15},
   abstract = {Purpose: In this study we propose a new computer-aided diagnosis (CADx) to distinguish between malign and benign mass and non-mass lesions in breast DCE-MRI. For this purpose, we introduce new frequency textural features.
-                                                       Methods: In this paper we propose novel normalized frequency-based features. These are obtained by applying the dual-tree complex wavelet transform to MRI slices containing a lesion for specific decomposition levels. The low-pass and band-pass frequency coefficients of the dual-tree complex wavelet transform represent the general shape and texture features respectively of the lesion. The extraction of these features is computationally efficient. We employ a support vector machine (SVM) to classify the lesions, and investigate modified cost functions and under- and oversampling strategies to handle the class imbalance.
-                                                       Results: The proposed method has been tested on a dataset of 80 patients containing 103 lesions. An area under the curve (AUC) of 0.98 for the mass and 0.94 for the non-mass lesions is obtained. Similarly, accuracies of 96.9% and 89.8%, sensitivities of 93.8% and 84.6% and specificities of 98% and 92.3% are obtained for the mass and non-mass lesions respectively.
-                                                       Conclusions: Normalized frequency-based features can characterize benign and malignant lesions efficiently in both mass and non-mass like lesions. Additionally, the combination of normalized frequency-based features and three dimensional shape descriptors improve the CADx performance.},
+                                                         Methods: In this paper we propose novel normalized frequency-based features. These are obtained by applying the dual-tree complex wavelet transform to MRI slices containing a lesion for specific decomposition levels. The low-pass and band-pass frequency coefficients of the dual-tree complex wavelet transform represent the general shape and texture features respectively of the lesion. The extraction of these features is computationally efficient. We employ a support vector machine (SVM) to classify the lesions, and investigate modified cost functions and under- and oversampling strategies to handle the class imbalance.
+                                                         Results: The proposed method has been tested on a dataset of 80 patients containing 103 lesions. An area under the curve (AUC) of 0.98 for the mass and 0.94 for the non-mass lesions is obtained. Similarly, accuracies of 96.9% and 89.8%, sensitivities of 93.8% and 84.6% and specificities of 98% and 92.3% are obtained for the mass and non-mass lesions respectively.
+                                                         Conclusions: Normalized frequency-based features can characterize benign and malignant lesions efficiently in both mass and non-mass like lesions. Additionally, the combination of normalized frequency-based features and three dimensional shape descriptors improve the CADx performance.},
   file = {Ayat19.pdf:pdf\\Ayat19.pdf:PDF},
   journal = IJCARS,
   optnote = {DIAG, RADIOLOGY},
@@ -2131,30 +2131,30 @@ @mastersthesis{Bagu18
   author = {Ines Correia Bagulho},
   title = {Reference Tissue Normalization of Prostate MRI with automatic Multi-Organ Deep Learning Pelvis segmentation},
   abstract = {Prostate cancer is the most common cancer among male patients and second leading cause of death
-                              from cancer in men (excluding non-melanoma skin cancer). Magnetic Resonance Imaging (MRI) is
-                              currently becoming the modality of choice for clinical staging of localized prostate cancer. However,
-                              MRI lacks intensity quantification which hinders its diagnostic ability. The overall aim of this dissertation
-                              is to automate a novel normalization method that can potentially quantify general MR intensities, thus
-                              improving the diagnostic ability of MRI.
-                              Two Prostate multi-parametric MRI cohorts, of 2012 and 2016, were used in this retrospective study. To
-                              improve the diagnostic ability of T2-Weighted MRI, a novel multi-reference tissue normalization method
-                              was tested and automated. This method consists of computing the average intensity of the referencetissues
-                              and the corresponding normalized reference values to define a look-up-table through interpolation.
-                              Since the method requires delineation of multiple reference tissues, an MRI-specific Deep Learning
-                              model, Aniso-3DUNET, was trained on manual segmentations and tested to automate this segmentation
-                              step. The output of the Deep Learning model, that consisted of automatic segmentations, was validated
-                              and used in an automatic normalization approach. The effect of the manual and automatic normalization
-                              approaches on diagnostic accuracy of T2-weighted intensities was determined with Receiver Operating
-                              Characteristic (ROC) analyses. The Areas Under the Curve (AUC) were compared.
-                              The automatic segmentation of multiple reference-tissues was validated with an average DICE score
-                              higher than 0.8 in the test phase. Thereafter, the method developed demonstrated that the normalized
-                              intensities lead to an improved diagnostic accuracy over raw intensities using the manual approach, with
-                              an AUC going from 0.54 (raw) to 0.68 (normalized), and automatic approach, with an AUC going from
-                              0.68 to 0.73.
-                              This study demonstrates that multi-reference tissue normalization improves quantification of T2-weighted
-                              images and diagnostic accuracy, possibly leading to a decrease in radiologist's interpretation variability.
-                              It is also possible to conclude that this novel T2-weighted MRI normalization method can be automatized,
-                              becoming clinically applicable.},
+                                from cancer in men (excluding non-melanoma skin cancer). Magnetic Resonance Imaging (MRI) is
+                                currently becoming the modality of choice for clinical staging of localized prostate cancer. However,
+                                MRI lacks intensity quantification which hinders its diagnostic ability. The overall aim of this dissertation
+                                is to automate a novel normalization method that can potentially quantify general MR intensities, thus
+                                improving the diagnostic ability of MRI.
+                                Two Prostate multi-parametric MRI cohorts, of 2012 and 2016, were used in this retrospective study. To
+                                improve the diagnostic ability of T2-Weighted MRI, a novel multi-reference tissue normalization method
+                                was tested and automated. This method consists of computing the average intensity of the reference tissues
+                                and the corresponding normalized reference values to define a look-up-table through interpolation.
+                                Since the method requires delineation of multiple reference tissues, an MRI-specific Deep Learning
+                                model, Aniso-3DUNET, was trained on manual segmentations and tested to automate this segmentation
+                                step. The output of the Deep Learning model, that consisted of automatic segmentations, was validated
+                                and used in an automatic normalization approach. The effect of the manual and automatic normalization
+                                approaches on diagnostic accuracy of T2-weighted intensities was determined with Receiver Operating
+                                Characteristic (ROC) analyses. The Areas Under the Curve (AUC) were compared.
+                                The automatic segmentation of multiple reference-tissues was validated with an average DICE score
+                                higher than 0.8 in the test phase. Thereafter, the method developed demonstrated that the normalized
+                                intensities lead to an improved diagnostic accuracy over raw intensities using the manual approach, with
+                                an AUC going from 0.54 (raw) to 0.68 (normalized), and automatic approach, with an AUC going from
+                                0.68 to 0.73.
+                                This study demonstrates that multi-reference tissue normalization improves quantification of T2-weighted
+                                images and diagnostic accuracy, possibly leading to a decrease in radiologist's interpretation variability.
+                                It is also possible to conclude that this novel T2-weighted MRI normalization method can be automatized,
+                                becoming clinically applicable.},
   file = {Bagu18.pdf:pdf/Bagu18.pdf:PDF},
   optnote = {DIAG},
   school = {Universidade De Lisboa},
@@ -2319,13 +2319,13 @@ @phdthesis{Balk20a
   title = {Tissue-based biomarker assessment for predicting prognosis of triple negative breast cancer: the additional value of artificial intelligence},
   url = {https://repository.ubn.ru.nl/handle/2066/220344},
   abstract = {Despite much research, currently still about 1 in 4 patients with TNBC will develop a recurrence after which the survival outlook is very poor. To date, no prognostic markers are available for TNBC to adequately stratify patients for the risk of developing a recurrence. The emergence of powerful computer algorithms, in particular deep learning models, enable more in depth and more extensive biomarker exploration. In addition, these algorithms are objective and reproducible, in contrast to most human visual biomarker assessment. The first aim of this thesis was to establish a well-defined cohort of TNBC, consisting of tissue sections, clinical and pathology data as well as comprehensive follow up data. Secondly, we aimed to evaluate the prognostic value of the mitotic count, which has widespread clinical use as part of the Nottingham grading system. We studied mitotic count both via conventional manual assessment and automatic assessment, to see if we could find a cut-off value which is better tailored for TNBC. Our third aim was to evaluate the prognostic value of TILs, a promising biomarker not yet used in clinical practice.
-
-                               To study the prognostic value of biomarkers in TNBC, the following objectives were defined:
-                               1.	Establish a multicentre TNBC cohort including tissue sections and follow up data (Chapter 2)
-                               2.	Develop a baseline prognostic model for TNBC based on the currently known clinicopathological variables (Chapter 2)
-                               3.	Establish a computer algorithm (Chapter 3) which can automatically find mitoses in WSI of breast cancer, and validate the algorithm (Chapter 4)
-                               4.	Explore the prognostic value of the mitotic count for TNBC using manual and automatic assessment (Chapter 5)
-                               5.	Optimize the assessment of tumour infiltrating lymphocytes using deep learning and study its prognostic value in TNBC (Chapter 6)},
+  
+                                 To study the prognostic value of biomarkers in TNBC, the following objectives were defined:
+                                 1.	Establish a multicentre TNBC cohort including tissue sections and follow up data (Chapter 2)
+                                 2.	Develop a baseline prognostic model for TNBC based on the currently known clinicopathological variables (Chapter 2)
+                                 3.	Establish a computer algorithm (Chapter 3) which can automatically find mitoses in WSI of breast cancer, and validate the algorithm (Chapter 4)
+                                 4.	Explore the prognostic value of the mitotic count for TNBC using manual and automatic assessment (Chapter 5)
+                                 5.	Optimize the assessment of tumour infiltrating lymphocytes using deep learning and study its prognostic value in TNBC (Chapter 6)},
   copromotor = {P. Bult and F. Ciompi},
   file = {Balk20a.pdf:pdf\\Balk20.pdf:PDF},
   optnote = {DIAG},
@@ -2340,11 +2340,11 @@ @conference{Balk20b
   booktitle = {European Journal of Cancer},
   title = {Deep learning enables fully automated mitotic density assessment in breast cancer histopathology},
   abstract = {Background: Mitosis counting is an important part of breast cancer grading, yet known to suffer from observer variability. Advances in machine learning enable fully automated analysis of digitized glass slides. The present study evaluated automatic mitosis counting and demonstrated applicability on triple negative breast cancers (TNBC).
-                          Material and Methods: In entire scanned H&E slides of 90 invasive breast tumours, a deep learning algorithm (DLA) fully automatically detected all mitoses and determined the hotspot (area with highest mitotic density). Subsequently, two independent observers assessed mitotic counts on glass slides according to routine practice, and in the computer-defined hotspot.
-                          Next, automated mitotic counting was performed in our TNBC cohort (n = 597). Multivariable Cox regression survival models were expanded with dichotomized mitotic counts. The c-statistic was used to evaluate the additional prognostic value of every possible cut off value.
-                          Results: Automatic counting showed excellent concordance with visual assessment in computer detected hotspots with intraclass correlation coefficients (ICC) of 0.895 (95% CI 0.845-0.930) and 0.888 (95% CI 0.783-0.936) for two observers, respectively. ICC of fully automated counting versus conventional glass slide assessment were 0.828 (95% CI 0.750-0.883 and 0.757 (95% CI 0.638-0.839), respectively.
-                          In the TNBC cohort, automatic mitotic counts ranged from 1 to 269 (mean 57.6) in 2 mm2 hotspots. None of the cut off values improved the models' baseline c-statistic.
-                          Conclusion: Automatic mitosis counting is a promising complementary aid for mitoses assessment. Our method was capable of fully automatically locating the mitotic hotspot in tumours, and was capable of processing a large series of TNBC, showing that mitotic count was not prognostic for TNBC even when attempting alternative cut off points.},
+                            Material and Methods: In entire scanned H&E slides of 90 invasive breast tumours, a deep learning algorithm (DLA) fully automatically detected all mitoses and determined the hotspot (area with highest mitotic density). Subsequently, two independent observers assessed mitotic counts on glass slides according to routine practice, and in the computer-defined hotspot.
+                            Next, automated mitotic counting was performed in our TNBC cohort (n = 597). Multivariable Cox regression survival models were expanded with dichotomized mitotic counts. The c-statistic was used to evaluate the additional prognostic value of every possible cut off value.
+                            Results: Automatic counting showed excellent concordance with visual assessment in computer detected hotspots with intraclass correlation coefficients (ICC) of 0.895 (95% CI 0.845-0.930) and 0.888 (95% CI 0.783-0.936) for two observers, respectively. ICC of fully automated counting versus conventional glass slide assessment were 0.828 (95% CI 0.750-0.883) and 0.757 (95% CI 0.638-0.839), respectively.
+                            In the TNBC cohort, automatic mitotic counts ranged from 1 to 269 (mean 57.6) in 2 mm2 hotspots. None of the cut off values improved the models' baseline c-statistic.
+                            Conclusion: Automatic mitosis counting is a promising complementary aid for mitoses assessment. Our method was capable of fully automatically locating the mitotic hotspot in tumours, and was capable of processing a large series of TNBC, showing that mitotic count was not prognostic for TNBC even when attempting alternative cut off points.},
   optnote = {DIAG, RADIOLOGY},
   year = {2020},
 }
@@ -2589,8 +2589,8 @@ @article{Band19a
   url = {https://peerj.com/articles/8242/},
   volume = {7},
   abstract = {Modern pathology diagnostics is being driven toward large scale digitization of microscopic tissue sections. A prerequisite for its safe implementation is the guarantee that all tissue present on a glass slide can also be found back in the digital image. Whole-slide scanners perform a tissue segmentation in a low resolution overview image to prevent inefficient high-resolution scanning of empty background areas. However, currently applied algorithms can fail in detecting all tissue regions.
-                                                       In this study, we developed convolutional neural networks to distinguish tissue from background. We collected 100 whole-slide images of 10 tissue samples--staining categories from five medical centers for development and testing. Additionally, eight more images of eight unfamiliar categories were collected for testing only. We compared our fully-convolutional neural networks to three traditional methods on a range of resolution levels using Dice score and sensitivity.
-                                                       We also tested whether a single neural network can perform equivalently to multiple networks, each specialized in a single resolution. Overall, our solutions outperformed the traditional methods on all the tested resolutions. The resolution-agnostic network achieved average Dice scores between 0.97 and 0.98 across the tested resolution levels, only 0.0069 less than the resolution-specific networks. Finally, its excellent generalization performance was demonstrated by achieving averages of 0.98 Dice score and 0.97 sensitivity on the eight unfamiliar images. A future study should test this network prospectively.},
+                                                         In this study, we developed convolutional neural networks to distinguish tissue from background. We collected 100 whole-slide images of 10 tissue samples--staining categories from five medical centers for development and testing. Additionally, eight more images of eight unfamiliar categories were collected for testing only. We compared our fully-convolutional neural networks to three traditional methods on a range of resolution levels using Dice score and sensitivity.
+                                                         We also tested whether a single neural network can perform equivalently to multiple networks, each specialized in a single resolution. Overall, our solutions outperformed the traditional methods on all the tested resolutions. The resolution-agnostic network achieved average Dice scores between 0.97 and 0.98 across the tested resolution levels, only 0.0069 less than the resolution-specific networks. Finally, its excellent generalization performance was demonstrated by achieving averages of 0.98 Dice score and 0.97 sensitivity on the eight unfamiliar images. A future study should test this network prospectively.},
   file = {Band19a.pdf:pdf\\Band19a.pdf:PDF},
   journal = PRJ,
   optnote = {DIAG, RADIOLOGY},
@@ -2610,8 +2610,8 @@ @article{Band23
   pages = {102755},
   volume = {85},
   abstract = {Recently, large, high-quality public datasets have led to the development of convolutional neural networks that can detect lymph node metastases of breast cancer at the level of expert pathologists. Many cancers, regardless of the site of origin, can metastasize to lymph nodes. However, collecting and annotating high-volume, high-quality datasets for every cancer type is challenging. In this paper we investigate how to leverage existing high-quality datasets most efficiently in multi-task settings for closely related tasks. Specifically, we will explore different training and domain adaptation strategies, including prevention of catastrophic forgetting, for breast, colon and head-and-neck cancer metastasis detection in lymph nodes.
-
-                          Our results show state-of-the-art performance on colon and head-and-neck cancer metastasis detection tasks. We show the effectiveness of adaptation of networks from one cancer type to another to obtain multi-task metastasis detection networks. Furthermore, we show that leveraging existing high-quality datasets can significantly boost performance on new target tasks and that catastrophic forgetting can be effectively mitigated. Last, we compare different mitigation strategies.},
+  
+                            Our results show state-of-the-art performance on colon and head-and-neck cancer metastasis detection tasks. We show the effectiveness of adaptation of networks from one cancer type to another to obtain multi-task metastasis detection networks. Furthermore, we show that leveraging existing high-quality datasets can significantly boost performance on new target tasks and that catastrophic forgetting can be effectively mitigated. Last, we compare different mitigation strategies.},
   file = {Band23.pdf:pdf\\Band23.pdf:PDF},
   journal = {Medical Image Analysis},
   optnote = {DIAG, PATHOLOGY, RADIOLOGY},
@@ -2749,14 +2749,14 @@ @conference{Beck16
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE: We aimed to evaluate the additional value of brain {CT} perfusion ({CTP}) for intracranial vessel occlusion detection in acute ischemic stroke for observers with different levels of experience.
-
-                                                       METHOD AND MATERIALS: We retrospectively included all patients with symptoms of acute ischemic stroke (onset of less than 9 hours) who were scanned with non-enhanced {CT} ({NECT}), {CT} angiography ({CTA}) and {CTP} in the year 2015. Four observers with different levels of experience (neuroradiologist, non-neuroradiologist, two radiology residents) evaluated the imaging data with 2 imaging strategies. Method 1 included {NECT} and {CTA}. For method 2, additional {CTP} maps were provided for the evaluation of intracranial vessel occlusion on {CTA}. The observers were blinded to patient identity and clinical outcome. Receiver operating characteristic ({ROC}) was used for the evaluation of accuracy in intracranial vessel occlusion detection. The reference standard of vessel occlusion was set based on the evaluation by the four observers, and the judgment of an independent neuroradiologist serving as a referee in case of discrepancy.
-
-                                                       RESULTS: In total 110 patients were included, preliminary analyses included 94 patients. There was an increase of {AUC} in the overall detection of intracranial vessel occlusion for observer 1, 3 and 4, though only for observer 1 the increase in {AUC} was statistically significant (p=0.041). Increase of intracranial vessel occlusion detection mainly concerned distal vessel occlusions. No significant added value of {CTP} was found for proximal vessel occlusions, with already a high accuracy based on {NECT} and {CTA} for all experience levels with sensitivity ranging between 86-94% and specificity between 92-100%.
-
-                                                       CONCLUSION: Our study demonstrates that the use of {CTP} can aid in the detection of distal intracranial vessel occlusions on {CTA} in case {CTP} is integrated in the reading strategy. It is also demonstrated that {CTP} was not of added value for the detection of proximal intracranial vessel occlusions. Finally, there was no major difference in the diagnostic accuracy of intracranial vessel occlusion detection for the different levels in experience of the observers.
-
-                                                       CLINICAL RELEVANCE/APPLICATION: Our study demonstrated that brain {CT} perfusion can aid in the detection of distal intracranial vessel occlusions, which is clinically relevant for optimizing the imaging strategy in acute ischemic stroke.},
+  
+                                                         METHOD AND MATERIALS: We retrospectively included all patients with symptoms of acute ischemic stroke (onset of less than 9 hours) who were scanned with non-enhanced {CT} ({NECT}), {CT} angiography ({CTA}) and {CTP} in the year 2015. Four observers with different levels of experience (neuroradiologist, non-neuroradiologist, two radiology residents) evaluated the imaging data with 2 imaging strategies. Method 1 included {NECT} and {CTA}. For method 2, additional {CTP} maps were provided for the evaluation of intracranial vessel occlusion on {CTA}. The observers were blinded to patient identity and clinical outcome. Receiver operating characteristic ({ROC}) was used for the evaluation of accuracy in intracranial vessel occlusion detection. The reference standard of vessel occlusion was set based on the evaluation by the four observers, and the judgment of an independent neuroradiologist serving as a referee in case of discrepancy.
+  
+                                                         RESULTS: In total 110 patients were included, preliminary analyses included 94 patients. There was an increase of {AUC} in the overall detection of intracranial vessel occlusion for observer 1, 3 and 4, though only for observer 1 the increase in {AUC} was statistically significant (p=0.041). Increase of intracranial vessel occlusion detection mainly concerned distal vessel occlusions. No significant added value of {CTP} was found for proximal vessel occlusions, with already a high accuracy based on {NECT} and {CTA} for all experience levels with sensitivity ranging between 86-94% and specificity between 92-100%.
+  
+                                                         CONCLUSION: Our study demonstrates that the use of {CTP} can aid in the detection of distal intracranial vessel occlusions on {CTA} in case {CTP} is integrated in the reading strategy. It is also demonstrated that {CTP} was not of added value for the detection of proximal intracranial vessel occlusions. Finally, there was no major difference in the diagnostic accuracy of intracranial vessel occlusion detection for the different levels in experience of the observers.
+  
+                                                         CLINICAL RELEVANCE/APPLICATION: Our study demonstrated that brain {CT} perfusion can aid in the detection of distal intracranial vessel occlusions, which is clinically relevant for optimizing the imaging strategy in acute ischemic stroke.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -2770,12 +2770,12 @@ @article{Beck19a
   pages = {124-129},
   doi = {10.1016/j.neurad.2018.03.003},
   abstract = {Background and purpose: To evaluate whether brain CT perfusion (CTP) aids in the detection of intracranial vessel occlusion on CT angiography (CTA) in acute ischemic stroke.
-
-                                                       Materials and methods: Medical-ethical committee approval of our hospital was obtained and informed consent was waived. Patients suspected of acute ischemic stroke who underwent non-contrast CT(NCCT), CTA and whole-brain CTP in our center in the year 2015 were included. Three observers with different levels of experience evaluated the imaging data of 110 patients for the presence or absence of intracranial arterial vessel occlusion with two strategies. In the first strategy, only NCCT and CTA were available. In the second strategy, CTP maps were provided in addition to NCCT and CTA. Receiver-operating-characteristic (ROC) analysis was used for the evaluation of diagnostic accuracy.
-
-                                                       Results: Overall, a brain perfusion deficit was scored present in 87-89% of the patients with an intracranial vessel occlusion, more frequently observed in the anterior than in the posterior circulation. Performance of intracranial vessel occlusion detection on CTA was significantly improved with the availability of CTP maps as compared to the first strategy (P = 0.023), due to improved detection of distal and posterior circulation vessel occlusions (P-values of 0.032 and 0.003 respectively). No added value of CTP was found for intracranial proximal vessel occlusion detection, with already high accuracy based on NCCT and CTA alone.
-
-                                                       Conclusion: The performance of intracranial vessel occlusion detection on CTA was improved with the availability of brain CT perfusion maps due to the improved detection of distal and posterior circulation vessel occlusions.},
+  
+                                                         Materials and methods: Medical-ethical committee approval of our hospital was obtained and informed consent was waived. Patients suspected of acute ischemic stroke who underwent non-contrast CT(NCCT), CTA and whole-brain CTP in our center in the year 2015 were included. Three observers with different levels of experience evaluated the imaging data of 110 patients for the presence or absence of intracranial arterial vessel occlusion with two strategies. In the first strategy, only NCCT and CTA were available. In the second strategy, CTP maps were provided in addition to NCCT and CTA. Receiver-operating-characteristic (ROC) analysis was used for the evaluation of diagnostic accuracy.
+  
+                                                         Results: Overall, a brain perfusion deficit was scored present in 87-89% of the patients with an intracranial vessel occlusion, more frequently observed in the anterior than in the posterior circulation. Performance of intracranial vessel occlusion detection on CTA was significantly improved with the availability of CTP maps as compared to the first strategy (P = 0.023), due to improved detection of distal and posterior circulation vessel occlusions (P-values of 0.032 and 0.003 respectively). No added value of CTP was found for intracranial proximal vessel occlusion detection, with already high accuracy based on NCCT and CTA alone.
+  
+                                                         Conclusion: The performance of intracranial vessel occlusion detection on CTA was improved with the availability of brain CT perfusion maps due to the improved detection of distal and posterior circulation vessel occlusions.},
   file = {pdf\\Beck19a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29625153},
@@ -2814,7 +2814,7 @@ @book{Beic16
   publisher = {CreateSpace.com},
   url = {http://www.amazon.com/Sixth-International-Workshop-Pulmonary-Analysis/dp/1537038583},
   abstract = {These are the proceedings of the sixth edition of the International Workshop on Pulmonary Image Analysis, held in conjunction with the Medical Image Computing and Computer Assisted Intervention (MICCAI) Conference 2016 in Athens, Greece. The International Workshop on Pulmonary Image Analysis brings together researchers in pulmonary image analysis to discuss new developments in the field. For the sixth edition of the workshop, all submitted papers received thorough reviews by at least three reviewers. In total, eight papers were accepted for presentation at the workshop, of which five were selected for oral presentation and three for poster presentation. The presented papers deal with di
-                                                       erent aspects of pulmonary image analysis, including computer aided diagnosis, segmentation, and registration. We would like to thank the organizers of MICCAI 2016 for hosting the sixth edition of the International Workshop on Pulmonary Image Analysis and for handling the logistics of the workshop, and all colleagues involved in the peer-review process.},
+                                                         erent aspects of pulmonary image analysis, including computer aided diagnosis, segmentation, and registration. We would like to thank the organizers of MICCAI 2016 for hosting the sixth edition of the International Workshop on Pulmonary Image Analysis and for handling the logistics of the workshop, and all colleagues involved in the peer-review process.},
   file = {Beic16.pdf:pdf/Beic16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   year = {2016},
@@ -2894,11 +2894,11 @@ @phdthesis{Bejn17a
   title = {Histopathological diagnosis of breast cancer using machine learning},
   url = {https://repository.ubn.ru.nl/handle/2066/178907},
   abstract = {Application of machine learning to WSI is a promising yet largely unexplored field of research. The primary aim of the research described in this thesis was to develop automated systems for analysis of H&E stained breast histopathological images. This involved automatic detection of ductal carcinoma in-situ (DCIS), invasive, and metastatic breast cancer in whole-slide histopathological images. A secondary aim was to identify new diagnostic biomarkers for the detection of invasive breast cancer. To this end the research was undertaken with the following objectives:
-
-                              1. Development of an algorithm for standardization of H&E stained WSIs;
-                              2. Detection, classification and segmentation of primary breast cancer;
-                              3. Evaluation of the state of the art of machine learning algorithms for automatic detection of lymph nodes metastases;
-                              4. Identifying and leveraging new stromal biomarkers to improve breast cancer diagnostics.},
+  
+                                1. Development of an algorithm for standardization of H&E stained WSIs;
+                                2. Detection, classification and segmentation of primary breast cancer;
+                                3. Evaluation of the state of the art of machine learning algorithms for automatic detection of lymph nodes metastases;
+                                4. Identifying and leveraging new stromal biomarkers to improve breast cancer diagnostics.},
   copromotor = {J.A.W.M. van der Laak and G. Litjens},
   file = {Bejn17a.pdf:pdf\\Bejn17a.pdf:PDF},
   optnote = {DIAG},
@@ -3254,16 +3254,16 @@ @article{Blek19
   month = {9},
   doi = {https://doi.org/10.1007/s00330-019-06488-y},
   abstract = {Objectives
-                                                       To create a radiomics approach based on multiparametric magnetic resonance imaging (mpMRI) features extracted from an auto-fixed volume of interest (VOI) that quantifies the phenotype of clinically significant (CS) peripheral zone (PZ) prostate cancer (PCa).
-
-                                                       Methods
-                                                       This study included 206 patients with 262 prospectively called mpMRI prostate imaging reporting and data system 3-5 PZ lesions. Gleason scores > 6 were defined as CS PCa. Features were extracted with an auto-fixed 12-mm spherical VOI placed around a pin point in each lesion. The value of dynamic contrast-enhanced imaging(DCE), multivariate feature selection and extreme gradient boosting (XGB) vs. univariate feature selection and random forest (RF), expert-based feature pre-selection, and the addition of image filters was investigated using the training (171 lesions) and test (91 lesions) datasets.
-
-                                                       Results
-                                                       The best model with features from T2-weighted (T2-w) + diffusion-weighted imaging (DWI) + DCE had an area under the curve (AUC) of 0.870 (95% CI 0.980-0.754). Removal of DCE features decreased AUC to 0.816 (95% CI 0.920-0.710), although not significantly (p = 0.119). Multivariate and XGB outperformed univariate and RF (p = 0.028). Expert-based feature pre-selection and image filters had no significant contribution.
-
-                                                       Conclusions
-                                                       The phenotype of CS PZ PCa lesions can be quantified using a radiomics approach based on features extracted from T2-w + DWI using an auto-fixed VOI. Although DCE features improve diagnostic performance, this is not statistically significant. Multivariate feature selection and XGB should be preferred over univariate feature selection and RF. The developed model may be a valuable addition to traditional visual assessment in diagnosing CS PZ PCa.},
+                                                         To create a radiomics approach based on multiparametric magnetic resonance imaging (mpMRI) features extracted from an auto-fixed volume of interest (VOI) that quantifies the phenotype of clinically significant (CS) peripheral zone (PZ) prostate cancer (PCa).
+  
+                                                         Methods
+                                                         This study included 206 patients with 262 prospectively called mpMRI prostate imaging reporting and data system 3-5 PZ lesions. Gleason scores > 6 were defined as CS PCa. Features were extracted with an auto-fixed 12-mm spherical VOI placed around a pin point in each lesion. The value of dynamic contrast-enhanced imaging(DCE), multivariate feature selection and extreme gradient boosting (XGB) vs. univariate feature selection and random forest (RF), expert-based feature pre-selection, and the addition of image filters was investigated using the training (171 lesions) and test (91 lesions) datasets.
+  
+                                                         Results
+                                                         The best model with features from T2-weighted (T2-w) + diffusion-weighted imaging (DWI) + DCE had an area under the curve (AUC) of 0.870 (95% CI 0.980-0.754). Removal of DCE features decreased AUC to 0.816 (95% CI 0.920-0.710), although not significantly (p = 0.119). Multivariate and XGB outperformed univariate and RF (p = 0.028). Expert-based feature pre-selection and image filters had no significant contribution.
+  
+                                                         Conclusions
+                                                         The phenotype of CS PZ PCa lesions can be quantified using a radiomics approach based on features extracted from T2-w + DWI using an auto-fixed VOI. Although DCE features improve diagnostic performance, this is not statistically significant. Multivariate feature selection and XGB should be preferred over univariate feature selection and RF. The developed model may be a valuable addition to traditional visual assessment in diagnosing CS PZ PCa.},
   file = {:pdf/Blek19.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {31776744},
@@ -3283,16 +3283,16 @@ @article{Blek21
   doi = {https://doi.org/10.1186/s13244-021-01099-y},
   optnote = {DIAG, RADIOLOGY},
   abstract = {Objectives
-                             To investigate a previously developed radiomics-based biparametric magnetic resonance imaging (bpMRI) approach for discrimination of clinically significant peripheral zone prostate cancer (PZ csPCa) using multi-center, multi-vendor (McMv) and single-center, single-vendor (ScSv) datasets.
-
-                             Methods
-                             This study's starting point was a previously developed ScSv algorithm for PZ csPCa whose performance was demonstrated in a single-center dataset. A McMv dataset was collected, and 262 PZ PCa lesions (9 centers, 2 vendors) were selected to identically develop a multi-center algorithm. The single-center algorithm was then applied to the multi-center dataset (single-multi-validation), and the McMv algorithm was applied to both the multi-center dataset (multi-multi-validation) and the previously used single-center dataset (multi-single-validation). The areas under the curve (AUCs) of the validations were compared using bootstrapping.
-
-                             Results
-                             Previously the single-single validation achieved an AUC of 0.82 (95% CI 0.71-0.92), a significant performance reduction of 27.2% compared to the single-multi-validation AUC of 0.59 (95% CI 0.51-0.68). The new multi-center model achieved a multi-multi-validation AUC of 0.75 (95% CI 0.64-0.84). Compared to the multi-single-validation AUC of 0.66 (95% CI 0.56-0.75), the performance did not decrease significantly (p value: 0.114). Bootstrapped comparison showed similar single-center performances and a significantly different multi-center performance (p values: 0.03, 0.012).
-
-                             Conclusions
-                             A single-center trained radiomics-based bpMRI model does not generalize to multi-center data. Multi-center trained radiomics-based bpMRI models do generalize, have equal single-center performance and perform better on multi-center data.},
+                               To investigate a previously developed radiomics-based biparametric magnetic resonance imaging (bpMRI) approach for discrimination of clinically significant peripheral zone prostate cancer (PZ csPCa) using multi-center, multi-vendor (McMv) and single-center, single-vendor (ScSv) datasets.
+  
+                               Methods
+                               This study's starting point was a previously developed ScSv algorithm for PZ csPCa whose performance was demonstrated in a single-center dataset. A McMv dataset was collected, and 262 PZ PCa lesions (9 centers, 2 vendors) were selected to identically develop a multi-center algorithm. The single-center algorithm was then applied to the multi-center dataset (single-multi-validation), and the McMv algorithm was applied to both the multi-center dataset (multi-multi-validation) and the previously used single-center dataset (multi-single-validation). The areas under the curve (AUCs) of the validations were compared using bootstrapping.
+  
+                               Results
+                               Previously the single-single validation achieved an AUC of 0.82 (95% CI 0.71-0.92), a significant performance reduction of 27.2% compared to the single-multi-validation AUC of 0.59 (95% CI 0.51-0.68). The new multi-center model achieved a multi-multi-validation AUC of 0.75 (95% CI 0.64-0.84). Compared to the multi-single-validation AUC of 0.66 (95% CI 0.56-0.75), the performance did not decrease significantly (p value: 0.114). Bootstrapped comparison showed similar single-center performances and a significantly different multi-center performance (p values: 0.03, 0.012).
+  
+                               Conclusions
+                               A single-center trained radiomics-based bpMRI model does not generalize to multi-center data. Multi-center trained radiomics-based bpMRI models do generalize, have equal single-center performance and perform better on multi-center data.},
   taverne_url = {https://repository.ubn.ru.nl/handle/2066/239809},
   ss_id = {3d130766be579a65496f87ec07f51123206fe131},
   all_ss_ids = {['3d130766be579a65496f87ec07f51123206fe131']},
@@ -3300,15 +3300,38 @@ @article{Blek21
 }
 
 @article{Blek22,
-  author = {Bleker, Jeroen and Kwee, Thomas C and Rouw, Dennis and Roest, Christian and Borstlap, Jaap and de Jong, Igle Jan and Dierckx, Rudi AJO and Huisman, Henkjan and Yakar, Derya},
+  author = {Bleker, Jeroen and Kwee, Thomas C. and Rouw, Dennis and Roest, Christian and Borstlap, Jaap and de Jong, Igle Jan and Dierckx, Rudi A. J. O. and Huisman, Henkjan and Yakar, Derya},
   title = {A deep learning masked segmentation alternative to manual segmentation in biparametric MRI prostate cancer radiomics},
-  journal = {European Radiology},
-  pages = {1--10},
+  doi = {10.1007/s00330-022-08712-8},
   year = {2022},
-  publisher = {Springer},
-  ss_id = {9d6b9f6c203c73af662853a1320659a62eb9be4b},
+  abstract = {Abstract
+                 Objectives
+                 To determine the value of a deep learning masked (DLM) auto-fixed volume of interest (VOI) segmentation method as an alternative to manual segmentation for radiomics-based diagnosis of clinically significant (CS) prostate cancer (PCa) on biparametric magnetic resonance imaging (bpMRI).
+               
+                 Materials and methods
+                 This study included a retrospective multi-center dataset of 524 PCa lesions (of which 204 are CS PCa) on bpMRI. All lesions were both semi-automatically segmented with a DLM auto-fixed VOI method (averaging < 10 s per lesion) and manually segmented by an expert uroradiologist (averaging 5 min per lesion). The DLM auto-fixed VOI method uses a spherical VOI (with its center at the location of the lowest apparent diffusion coefficient of the prostate lesion as indicated with a single mouse click) from which non-prostate voxels are removed using a deep learning-based prostate segmentation algorithm. Thirteen different DLM auto-fixed VOI diameters (ranging from 6 to 30 mm) were explored. Extracted radiomics data were split into training and test sets (4:1 ratio). Performance was assessed with receiver operating characteristic (ROC) analysis.
+               
+                 Results
+                 In the test set, the area under the ROC curve (AUCs) of the DLM auto-fixed VOI method with a VOI diameter of 18 mm (0.76 [95% CI: 0.66-0.85]) was significantly higher (p = 0.0198) than that of the manual segmentation method (0.62 [95% CI: 0.52-0.73]).
+               
+                 Conclusions
+                 A DLM auto-fixed VOI segmentation can provide a potentially more accurate radiomics diagnosis of CS PCa than expert manual segmentation while also reducing expert time investment by more than 97%.
+               
+                 Key Points
+                 * Compared to traditional expert-based segmentation, a deep learning mask (DLM) auto-fixed VOI placement is more accurate at detecting CS PCa.
+                 * Compared to traditional expert-based segmentation, a DLM auto-fixed VOI placement is faster and can result in a 97% time reduction.
+                 * Applying deep learning to an auto-fixed VOI radiomics approach can be valuable.
+               },
+  url = {http://dx.doi.org/10.1007/s00330-022-08712-8},
+  file = {Blek22.pdf:pdf\\Blek22.pdf:PDF},
+  optnote = {DIAG, RADIOLOGY},
+  journal = {European Radiology},
+  automatic = {yes},
   all_ss_ids = {['9d6b9f6c203c73af662853a1320659a62eb9be4b']},
-  gscites = {10},
+  citation-count = {14},
+  pages = {6526--6535},
+  volume = {32},
+  pmid = {35420303},
 }
 
 @article{Blek23,
@@ -3431,8 +3454,8 @@ @mastersthesis{Boer2020
   author = {Tristan de Boer},
   title = {A feasibility study for Deep Learning Image Guided Guidewire Tracking for Image-guided Interventions},
   abstract = {A feasibility study for Deep Learning Image Guided Guidewire Tracking for Image-guided Interventions
-                                                      A current challenge in real-time magnetic resonance imaging (MRI) guided minimally invasive images is needle tracking and planning. We propose a pipeline for automatic object detection using a state-of-the-art object detection network. Predictions by the object detection network were used to translate the MRI plane to keep a guidewire tip in a plane. We evaluated the pipeline on displacement error between the prediction and the actual location of the guidewire tip in a setup with an anthropomorphic blood vessel. For this setup, we hypothesized that the network should be able to correctly predict the actual location within a margin of 10 mm, at least within 1000 ms.
-                                                      Results show that the pipeline can accurately track the guidewire tip in real-time (within 458 ms), with a mean displacement error of 7 mm (s = 4). Based on this evidence, we have demonstrated the feasibility of deep learning assisted image-guided interventions, creating possibilities for other deep learning guided interventions. Our proposed method shows potential for cryoablation. During these types of minimally invasive procedures tracking needles can be a challenge.},
+                                                        A current challenge in real-time magnetic resonance imaging (MRI) guided minimally invasive images is needle tracking and planning. We propose a pipeline for automatic object detection using a state-of-the-art object detection network. Predictions by the object detection network were used to translate the MRI plane to keep a guidewire tip in a plane. We evaluated the pipeline on displacement error between the prediction and the actual location of the guidewire tip in a setup with an anthropomorphic blood vessel. For this setup, we hypothesized that the network should be able to correctly predict the actual location within a margin of 10 mm, at least within 1000 ms.
+                                                        Results show that the pipeline can accurately track the guidewire tip in real-time (within 458 ms), with a mean displacement error of 7 mm (s = 4). Based on this evidence, we have demonstrated the feasibility of deep learning assisted image-guided interventions, creating possibilities for other deep learning guided interventions. Our proposed method shows potential for cryoablation. During these types of minimally invasive procedures tracking needles can be a challenge.},
   file = {:pdf/Boer20a.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University},
@@ -3558,7 +3581,7 @@ @inproceedings{Bokh19
   pages = {81-94},
   url = {http://proceedings.mlr.press/v102/bokhorst19a.html},
   abstract = {We investigate the problem of building convolutional networks for semantic segmentation in histopathology images when weak supervision in the form of sparse manual annotations is provided in the training set. We propose to address this problem by modifying the loss function in order to balance the contribution of each pixel of the input data. We introduce and compare two approaches of loss balancing when sparse annotations are provided, namely (1) instance based balancing and (2) mini-batch based balancing. We also consider a scenario of full supervision in the form of dense annotations, and compare the performance of using either sparse or dense annotations with the proposed balancing schemes. Finally, we show that using a bulk of sparse annotations and a
-                                                       small fraction of dense annotations allows to achieve performance comparable to full supervision.},
+                                                         small fraction of dense annotations allows to achieve performance comparable to full supervision.},
   file = {Bokh19.pdf:pdf\\Bokh19.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   gsid = {9237806221216165855},
@@ -3598,16 +3621,16 @@ @conference{Bokh20
   booktitle = ECP,
   title = {Computer-assisted hot-spot selection for tumor budding assessment in colorectal cancer},
   abstract = {Background & objectives
-                                                      Tumor budding (TB) is an established prognosticator for colorectal cancer. Detection of the hot-spot to score TB is based on visual inspection, hindering reproducibility of this important factor. We present an algorithm that can potentially assist pathologists in this task.
-
-                                                      Methods
-                                                      We used a previously developed algorithm for the detection of tumor buds in pan-cytokeratin stained whole slide images, calculating the number of buds for each location using a circle with 0.785mm2 surface area. From these numbers, density heatmaps were produced. The algorithm was applied to 270 slides from Bern University hospital, in which hot-spots and tumor buds were visually identified.
-
-                                                      Results
-                                                      Heat maps were created and we located the hand-selected hotspot and noted the associated TB number. The differences and similarities between computer identified and manually selected hot-spots were visually assessed as well as via histograms. Preliminary results show that the heatmaps are helpful, as locations with the highest TB density (the top 15%) also include the hand-selected hotspots. The full results will be presented during the conference.
-
-                                                      Conclusion
-                                                      The presented algorithm can assist the pathologist in selecting the hot-spot with the highest tumor bud count with more ease at low magnification and can help to reduce the high interobserver variability among pathologists in scoring tumor budding.},
+                                                        Tumor budding (TB) is an established prognosticator for colorectal cancer. Detection of the hot-spot to score TB is based on visual inspection, hindering reproducibility of this important factor. We present an algorithm that can potentially assist pathologists in this task.
+  
+                                                        Methods
+                                                        We used a previously developed algorithm for the detection of tumor buds in pan-cytokeratin stained whole slide images, calculating the number of buds for each location using a circle with 0.785mm2 surface area. From these numbers, density heatmaps were produced. The algorithm was applied to 270 slides from Bern University hospital, in which hot-spots and tumor buds were visually identified.
+  
+                                                        Results
+                                                        Heat maps were created and we located the hand-selected hotspot and noted the associated TB number. The differences and similarities between computer identified and manually selected hot-spots were visually assessed as well as via histograms. Preliminary results show that the heatmaps are helpful, as locations with the highest TB density (the top 15%) also include the hand-selected hotspots. The full results will be presented during the conference.
+  
+                                                        Conclusion
+                                                        The presented algorithm can assist the pathologist in selecting the hot-spot with the highest tumor bud count with more ease at low magnification and can help to reduce the high interobserver variability among pathologists in scoring tumor budding.},
   optnote = {DIAG},
   year = {2020},
 }
@@ -3617,16 +3640,16 @@ @conference{Bokh20a
   booktitle = ECP,
   title = {Deep learning based tumor bud detection in pan-cytokeratin stained colorectal cancer whole-slide images},
   abstract = {Background & objectives
-                                                      Tumor budding (TB) is an established prognosticator for colorectal cancer. Deep learning based TB assessment has the potential to improve diagnostic reproducibility and efficiency.  We developed an algorithm that can detect individual tumor buds in pan-cytokeratin stained colorectal cancer slides
-
-                                                      Methods
-                                                      Tumor-bud candidates (n=1765, collected from 58 whole slide images; WSI) were labeled by seven experts as either TB, poorly differentiated cluster, or neither. The 58 slides were randomly split into a training (49) and test-set (9). A deep learning (DL) model was trained using the buds identified by the experts in the training set.
-
-                                                      Results
-                                                      The algorithm was tested on the nine remaining WSI and 270 WSI from pan-cytokeratin stained slides from Bern University hospital, in which hot spots and TB were manually scored. An F1 score of 0.82 was found for correspondence at the bud level between experts and DL. A correlation of 0.745 was found between the manually counted buds within the hotspots and the automated method in the 270 WSIs.
-
-                                                      Conclusion
-                                                      Assessment of tumor budding as a prognostic factor for colorectal cancer can be automated using deep learning. At the level of individual tumor buds, correspondence between DL and  experts is high and comparable to the inter-rater variability. However, compared to the manual procedure, the algorithm yields higher counts for cases with relatively high bud densities (>15). Follow-up studies will focus on the assessment of TB in H&E stained slides.},
+                                                        Tumor budding (TB) is an established prognosticator for colorectal cancer. Deep learning based TB assessment has the potential to improve diagnostic reproducibility and efficiency. We developed an algorithm that can detect individual tumor buds in pan-cytokeratin stained colorectal cancer slides.
+  
+                                                        Methods
+                                                        Tumor-bud candidates (n=1765, collected from 58 whole slide images; WSI) were labeled by seven experts as either TB, poorly differentiated cluster, or neither. The 58 slides were randomly split into a training (49) and test-set (9). A deep learning (DL) model was trained using the buds identified by the experts in the training set.
+  
+                                                        Results
+                                                        The algorithm was tested on the nine remaining WSI and 270 WSI from pan-cytokeratin stained slides from Bern University hospital, in which hot spots and TB were manually scored. An F1 score of 0.82 was found for correspondence at the bud level between experts and DL. A correlation of 0.745 was found between the manually counted buds within the hotspots and the automated method in the 270 WSIs.
+  
+                                                        Conclusion
+                                                        Assessment of tumor budding as a prognostic factor for colorectal cancer can be automated using deep learning. At the level of individual tumor buds, correspondence between DL and  experts is high and comparable to the inter-rater variability. However, compared to the manual procedure, the algorithm yields higher counts for cases with relatively high bud densities (>15). Follow-up studies will focus on the assessment of TB in H&E stained slides.},
   optnote = {DIAG},
   year = {2020},
 }
@@ -3924,8 +3947,8 @@ @article{Bort21
   url = {https://arxiv.org/abs/2006.06356},
   author = {Bortsova, Gerda and Gonz\'{a}lez-Gonzalo, Cristina and Wetstein, Suzanne C. and Dubost, Florian and Katramados, Ioannis and Hogeweg, Laurens and Liefers, Bart and van Ginneken, Bram and Pluim, Josien P.W. and Veta, Mitko and S\'{a}nchez, Clara I. and de Bruijne, Marleen},
   abstract = {Adversarial attacks are considered a potentially serious security threat for machine learning systems. Medical image analysis (MedIA) systems have recently been argued to be vulnerable to adversarial attacks due to strong financial incentives and the associated technological infrastructure. In this paper, we study previously unexplored factors affecting adversarial attack vulnerability of deep learning MedIA systems in three medical domains: ophthalmology, radiology, and pathology. We focus on adversarial black-box settings, in which the attacker does not have full access to the target model and usually uses another model, commonly referred to as surrogate model, to craft adversarial examples that are then transferred to the target model. We consider this to be the most realistic scenario for MedIA systems. Firstly, we study the effect of weight initialization (pre-training on ImageNet or random initialization) on the transferability of adversarial attacks from the surrogate model to the target model, i.e., how effective attacks crafted using the surrogate model are on the target model. Secondly, we study the influence of differences in development (training and validation) data between target and surrogate models. We further study the interaction of weight initialization and data differences with differences in model architecture. All experiments were done with a perturbation degree tuned to ensure maximal transferability at minimal visual perceptibility of the attacks. Our experiments show that pre-training may dramatically increase the transferability of adversarial examples, even when the target and surrogate's architectures are different: the larger the performance gain using pre-training, the larger the transferability. Differences in the development data between target and surrogate models considerably decrease the performance of the attack; this decrease is further amplified by difference in the model architecture. 
We believe these factors should be considered when developing security-critical MedIA systems planned to be deployed in clinical practice. We recommend avoiding using only standard components, such as pre-trained architectures and publicly available datasets, as well as disclosure of design specifications, in addition to using adversarial defense methods. When evaluating the vulnerability of MedIA systems to adversarial attacks, various attack scenarios and target-surrogate differences should be simulated to achieve realistic robustness estimates. The code and all trained models used in our experiments are publicly available.
-
-                             (The first three authors contributed equally to this work.)},
+  
+                               (The first three authors contributed equally to this work.)},
   publisher = {Elsevier},
   optnote = {DIAG},
   taverne_url = {https://repository.ubn.ru.nl/handle/2066/238599},
@@ -3998,7 +4021,7 @@ @inproceedings{Bosma23a
   title = {Reproducibility of Training Deep Learning Models for Medical Image Analysis},
   url = {https://openreview.net/forum?id=MR01DcGST9},
   abstract = {Performance of deep learning algorithms varies due to their development data and training method, but also due to several stochastic processes during training. Due to these random factors, a single training run may not accurately reflect the performance of a given training method. Statistical comparisons in literature between different deep learning training methods typically ignore this performance variation between training runs and incorrectly claim significance of changes in training method. We hypothesize that the impact of such performance variation is substantial, such that it may invalidate biomedical competition leaderboards and some scientific papers. To test this, we investigate the reproducibility of training deep learning algorithms for medical image analysis. We repeated training runs from prior scientific studies: three diagnostic tasks (pancreatic cancer detection in CT, clinically significant prostate cancer detection in MRI, and lung nodule malignancy risk estimation in low-dose CT) and two organ segmentation tasks (pancreas segmentation in CT and prostate segmentation in MRI). A previously published top-performing algorithm for each task was trained multiple times to determine the variance in model performance. For all three diagnostic algorithms, performance variation from retraining was significant compared to data variance. Statistically comparing independently trained algorithms from the same training method using the same dataset should follow the null hypothesis, but we observed claimed significance with a p-value below 0.05 in
-                           of comparisons with conventional testing (paired bootstrapping). We conclude that variance in model performance due to retraining is substantial and should be accounted for.},
+                             of comparisons with conventional testing (paired bootstrapping). We conclude that variance in model performance due to retraining is substantial and should be accounted for.},
   file = {Bosma23a.pdf:pdf\\Bosma23a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
@@ -4144,34 +4167,34 @@ @article{Bozg24
   doi = {10.1007/s00330-024-10869-3},
   year = {2024},
   abstract = {Abstract
-               Objective
-               To review the components of past and present active surveillance (AS) protocols, provide an overview of the current studies employing artificial intelligence (AI) in AS of prostate cancer, discuss the current challenges of AI in AS, and offer recommendations for future research.
-
-               Methods
-               Research studies on the topic of MRI-based AI were reviewed to summarize current possibilities and diagnostic accuracies for AI methods in the context of AS. Established guidelines were used to identify possibilities for future refinement using AI.
-
-               Results
-               Preliminary results show the role of AI in a range of diagnostic tasks in AS populations, including the localization, follow-up, and prognostication of prostate cancer. Current evidence is insufficient to support a shift to AI-based AS, with studies being limited by small dataset sizes, heterogeneous inclusion and outcome definitions, or lacking appropriate benchmarks.
-
-               Conclusion
-               The AI-based integration of prostate MRI is a direction that promises substantial benefits for AS in the future, but evidence is currently insufficient to support implementation. Studies with standardized inclusion criteria and standardized progression definitions are needed to support this. The increasing inclusion of patients in AS protocols and the incorporation of MRI as a scheduled examination in AS protocols may help to alleviate these challenges in future studies.
-
-               Clinical relevance statement
-               This manuscript provides an overview of available evidence for the integration of prostate MRI and AI in active surveillance, addressing its potential for clinical optimizations in the context of established guidelines, while highlighting the main challenges for implementation.
-
-               Key Points
-               <jats:list list-type="bullet">
-                 <jats:list-item>
-                   Active surveillance is currently based on diagnostic tests such as PSA, biopsy, and imaging.
-                 </jats:list-item>
-                 <jats:list-item>
-                   Prostate MRI and AI demonstrate promising diagnostic accuracy across a variety of tasks, including the localization, follow-up and risk estimation in active surveillance cohorts.
-                 </jats:list-item>
-                 <jats:list-item>
-                   A transition to AI-based active surveillance is not currently realistic; larger studies using standardized inclusion criteria and outcomes are necessary to improve and validate existing evidence.
-                 </jats:list-item>
-
-             },
+                 Objective
+                 To review the components of past and present active surveillance (AS) protocols, provide an overview of the current studies employing artificial intelligence (AI) in AS of prostate cancer, discuss the current challenges of AI in AS, and offer recommendations for future research.
+  
+                 Methods
+                 Research studies on the topic of MRI-based AI were reviewed to summarize current possibilities and diagnostic accuracies for AI methods in the context of AS. Established guidelines were used to identify possibilities for future refinement using AI.
+  
+                 Results
+                 Preliminary results show the role of AI in a range of diagnostic tasks in AS populations, including the localization, follow-up, and prognostication of prostate cancer. Current evidence is insufficient to support a shift to AI-based AS, with studies being limited by small dataset sizes, heterogeneous inclusion and outcome definitions, or lacking appropriate benchmarks.
+  
+                 Conclusion
+                 The AI-based integration of prostate MRI is a direction that promises substantial benefits for AS in the future, but evidence is currently insufficient to support implementation. Studies with standardized inclusion criteria and standardized progression definitions are needed to support this. The increasing inclusion of patients in AS protocols and the incorporation of MRI as a scheduled examination in AS protocols may help to alleviate these challenges in future studies.
+  
+                 Clinical relevance statement
+                 This manuscript provides an overview of available evidence for the integration of prostate MRI and AI in active surveillance, addressing its potential for clinical optimizations in the context of established guidelines, while highlighting the main challenges for implementation.
+  
+                 Key Points
+                 <jats:list list-type="bullet">
+                   <jats:list-item>
+                     Active surveillance is currently based on diagnostic tests such as PSA, biopsy, and imaging.
+                   </jats:list-item>
+                   <jats:list-item>
+                     Prostate MRI and AI demonstrate promising diagnostic accuracy across a variety of tasks, including the localization, follow-up and risk estimation in active surveillance cohorts.
+                   </jats:list-item>
+                   <jats:list-item>
+                     A transition to AI-based active surveillance is not currently realistic; larger studies using standardized inclusion criteria and outcomes are necessary to improve and validate existing evidence.
+                   </jats:list-item>
+  
+               },
   url = {http://dx.doi.org/10.1007/s00330-024-10869-3},
   file = {Bozg24.pdf:pdf\\Bozg24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -4208,19 +4231,19 @@ @article{Bozo17
   url = {http://dx.doi.org/10.1097/RTI.0000000000000255},
   volume = {32},
   abstract = {Purpose:
-                                     The aim of the study was to retrospectively evaluate the diagnostic imaging that potential lung donors undergo, the reader variability of image interpretation and its relevance for donation, and the potential information gained from imaging studies not primarily intended for lung evaluation but partially including them.
-
-
-                                     Materials and Methods:
-                                     Bedside chest radiography and computed tomography (CT), completely or incompletely including the lungs, of 110 brain-dead potential organ donors in a single institution during 2007 to 2014 were reviewed from a donation perspective. Two chest radiologists in consensus analyzed catheters and cardiovascular, parenchymal, and pleural findings. Clinical reports and study review were compared for substantial differences in findings that could have led to a treatment change, triggered additional examinations such as bronchoscopy, or were considered important for donation.
-
-
-                                     Results:
-                                     Among 136 bedside chest radiographs, no differences between clinical reports and study reviews were found in 37 (27%), minor differences were found in 28 (21%), and substantial differences were found in 71 (52%) examinations (<jats:italic toggle="yes">P&lt;0.0001). In 31 of 42 (74%) complete or incomplete CT examinations, 50 of 74 findings with relevance for lung donation were not primarily reported (<jats:italic toggle="yes">P&lt;0.0001).
-
-
-                                     Conclusions:
-                                     The majority of donor patients undergo only chest radiography. A targeted imaging review of abnormalities affecting the decision to use donor lungs may be useful in the preoperative stage. With a targeted list, substantial changes were made from initial clinical interpretations. CT can provide valuable information on donor lung pathology, even if the lungs are only partially imaged.},
+                                       The aim of the study was to retrospectively evaluate the diagnostic imaging that potential lung donors undergo, the reader variability of image interpretation and its relevance for donation, and the potential information gained from imaging studies not primarily intended for lung evaluation but partially including them.
+  
+  
+                                       Materials and Methods:
+                                       Bedside chest radiography and computed tomography (CT), completely or incompletely including the lungs, of 110 brain-dead potential organ donors in a single institution during 2007 to 2014 were reviewed from a donation perspective. Two chest radiologists in consensus analyzed catheters and cardiovascular, parenchymal, and pleural findings. Clinical reports and study review were compared for substantial differences in findings that could have led to a treatment change, triggered additional examinations such as bronchoscopy, or were considered important for donation.
+  
+  
+                                       Results:
+                                       Among 136 bedside chest radiographs, no differences between clinical reports and study reviews were found in 37 (27%), minor differences were found in 28 (21%), and substantial differences were found in 71 (52%) examinations (P<0.0001). In 31 of 42 (74%) complete or incomplete CT examinations, 50 of 74 findings with relevance for lung donation were not primarily reported (P<0.0001).
+  
+  
+                                       Conclusions:
+                                       The majority of donor patients undergo only chest radiography. A targeted imaging review of abnormalities affecting the decision to use donor lungs may be useful in the preoperative stage. With a targeted list, substantial changes were made from initial clinical interpretations. CT can provide valuable information on donor lung pathology, even if the lungs are only partially imaged.},
   all_ss_ids = {[5b9e4bf58d0091f82346ce17b0a0eeb2d404a499]},
   automatic = {yes},
   citation-count = {7},
@@ -4275,7 +4298,7 @@ @phdthesis{Brak00a
   year = {2000},
   url = {http://repository.ubn.ru.nl/handle/2066/18825},
   abstract = {This thesis describes the components of an automated detection method for masses and architectural distortions, signs of infiltrating cancer. Masses and architectural distortions can be very subtle and are frequently missed by radiologists. Because the success of treatment of breast cancer depends largely on the stage of the tumor at the time of detection, early detection is very important. Masses have two main image characteristics that can be used for detection: a radiating pattern of spicules and a mass. Sometimes both characteristics are present, but often only spicules or just a faint mass is visible. To achieve high sensitivity on the whole spectrum of possible appearances of masses and distortions, detection of both characteristics is essential. Chapter 2 describes a sensitive method to detect radiating spicule patterns using statistical analysis of line orientations. However, many masses do not show clear spiculation, and must be detected by their mass. Chapter 3 describes how the spicule detection method can be transformed to a mass detection method. Instead of a map of line orientations, a map of gradient orientations is computed. Statistical analysis of this orientation map was used to detect masses. A large set of mammograms taken from the Nijmegen screening program was used to test a detection method based on spicules, a detection method based on masses, and a detection method that detects both spicules and masses. Best results were obtained when both the spiculation and mass features were used. Of all masses, 85% was detected at a specificity level of 1 false positive per image, 55% at 1 false positive per 10 images. The diameter of masses in mammograms varies from 5 mm to 5 cm, inspiring many research groups to use multi-scale approaches to detect masses. However, the benefit of applying their method in a multi-scale way is almost never compared to a single-scale version of their method. 
In Chapter 4, the mass detection method of Chapter 3 and two popular pattern recognition techniques to detect bright areas were applied in a single and multi-scale way to examine the possible gain of multi-scale detection. It appeared that the multi-scale versions of the mass detection method had similar performance as a single-scale approach if this scale was chosen appropriately. Of course, when the scale for the single-scale approach was chosen sub-optimally the performance was lower. This study shows that it is not self-evident that a multi-scale mass detection method gives better results than a single-scale version of the method. A multi-scale method is sensitive for masses over a range of sizes, but is also sensitive for false positives of different sizes. The specificity level that was achieved by the mass detection method described in Chapter 3 is not high enough for successful application in the clinic or in screening. To improve the specificity, a second stage was designed, that classifies each detected region based on regional criteria like contrast, shape, and texture. Based on such features, many normal tissue regions could be discriminated from real masses. To compute these features, a segmentation of the suspicious regions is required. In Chapter 5, a method is described to segment masses 126 using a discrete dynamic contour model. For each region a size estimate was available of the suspect region, and an appropriate initial starting contour was created that was fitted to the edge of the region. The method proved to be fast and robust, and outperformed a region growing approach. In Chapter 6, the contour model was used to segment regions that were found by the mass detection method of Chapter 3. A number of features were implemented that capture image characteristics that radiologists use to determine whether a suspicious region is a mass or dense normal tissue. 
Classification using these regional features gave a large reduction in false positives at each desired sensitivity level. On two large datasets a relatively high sensitivity was achieved even at high specificity levels. In Chapter 7,
-                                                        all segmentation methods of Chapter 5 were used to segment and classify the detected regions. The adaptive discrete contour method that was used in Chapter 6 and the preprocessed probabilistic region growing method gave similar results. The experiments of Chapter 8 showed that a substantial number of the tumors that were missed by radiologists in a screening program despite double reading, were detected by the mass detection method of Chapter 3. Successful detection of missed tumors indicates that a CAD system can be a useful tool for radiologists if the prompts are sufficiently specific. Chapter 9 describes two experiments that were done using a commercially available prompting device. A large experiment showed that the specificity of radiologists does not decrease when they are prompted. This is an important result because some fear that the large number of false positive prompts of a CAD system might increases the recall rate. Results of a second experiment indicated that radiologists have much more difficulty with interpreting suspicious signs than is generally believed. It seems that many screening errors that are thought to be due to oversight, are due to misinterpretation. Both experiments showed large differences in the performance levels of radiologists. Detection of masses is reaching a level of performance where successful use in screening or clinical practice is possible. Approximately 75% of all masses are detected in at least one view at a specificity level of 0.1 false positives per image. Improvement of the mass and spicule features is still possible, and more sophisticated features can be used to remove false positives. Because the data sets that are used for training are becoming larger, better classifiers can be produced. A considerable improvement can be expected when suspicious regions in one view are correlated to suspicious regions in the other view. 
Many strong false positives are only present in one of the views, real lesions are most often visible in both. Together with asymmetry features and a method to detect temporal changes in mammograms, another considerable reduction in false positives seems possible},
+                                                          all segmentation methods of Chapter 5 were used to segment and classify the detected regions. The adaptive discrete contour method that was used in Chapter 6 and the preprocessed probabilistic region growing method gave similar results. The experiments of Chapter 8 showed that a substantial number of the tumors that were missed by radiologists in a screening program despite double reading, were detected by the mass detection method of Chapter 3. Successful detection of missed tumors indicates that a CAD system can be a useful tool for radiologists if the prompts are sufficiently specific. Chapter 9 describes two experiments that were done using a commercially available prompting device. A large experiment showed that the specificity of radiologists does not decrease when they are prompted. This is an important result because some fear that the large number of false positive prompts of a CAD system might increase the recall rate. Results of a second experiment indicated that radiologists have much more difficulty with interpreting suspicious signs than is generally believed. It seems that many screening errors that are thought to be due to oversight, are due to misinterpretation. Both experiments showed large differences in the performance levels of radiologists. Detection of masses is reaching a level of performance where successful use in screening or clinical practice is possible. Approximately 75% of all masses are detected in at least one view at a specificity level of 0.1 false positives per image. Improvement of the mass and spicule features is still possible, and more sophisticated features can be used to remove false positives. Because the data sets that are used for training are becoming larger, better classifiers can be produced. A considerable improvement can be expected when suspicious regions in one view are correlated to suspicious regions in the other view. 
Many strong false positives are only present in one of the views, real lesions are most often visible in both. Together with asymmetry features and a method to detect temporal changes in mammograms, another considerable reduction in false positives seems possible},
   copromotor = {N. Karssemeijer},
   file = {Brak00a.pdf:pdf\\Brak00a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -4865,7 +4888,7 @@ @phdthesis{Brun11a
   year = {2011},
   url = {http://repository.tue.nl/715250},
   abstract = {Deep brain stimulation of the subthalamic nucleus (STN) has gained momentum as a therapy for advanced ParkinsonA-A?A 1/2 s disease. The stimulation effectively alleviates the patientsA-A?A 1/2  typical motor symptoms on a long term, but can give rise to cognitive and psychiatric adverse effects as well. Based on primate studies, the STN has been divided into three functionally different parts, which were distinguished by their afferent and efferent connections. The largest part is the motor area, followed by an associative and a limbic area. The serious adverse effects on cognition and behavior occurring after deep brain stimulation are assumed to be caused by electrical current spread to the associative and limbic areas of the STN. Therefore, selective stimulation of the motor part of the STN seems crucial, both to obtain the best possible therapeutic effect on the motor symptoms and to minimize the debilitating effects on cognition and behavior. However, current medical imaging techniques do not yet facilitate the required accurate identification of the STN itself, let alone its different functional areas. The final target for DBS is still often adjusted using intraoperative electrophysiology. Therefore, in this thesis we aimed to improve imaging for deep brain stimulation using noninvasive MRI protocols, in order to identify the STN and its motor part. We studied the advantages and drawbacks of already available noninvasive methods to target the STN. This review did not lead to a straightforward conclusion; identification of the STN motor part remained an open question. In follow-up on this question, we investigated the possibility to distinguish the different functional STN parts based on their connectivity information. Three types of information were carefully analyzed in this thesis. First, we looked into the clustering of local diffusion information within the STN region. 
We visually inspected the complex diffusion profiles, derived from postmortem rat brain data with high angular resolution, and augmented this manual segmentation method using k-means and graph cuts clustering. Because the weighing of different orders of diffusion information in the traditionally used L2 norm on the orientation distribution functions (ODFs) remained an open issue, we developed a specialized distance measure, the so-called Sobolev norm. This norm does not only take into account the amplitudes of the diffusion profiles, but also their extrema. We showed it to perform better than the L2 norm on synthetic phantom data and real brain (thalamus) data. The research done on this topic facilitates better classification by clustering of gray matter structures in the (deep) brain. Secondly, we were the first to analyze the STNA-A?A 1/2 s full structural connectivity, based on probabilistic fiber tracking in diffusion MRI data of healthy volunteers. The results correspond well to topical literature on STN projections. Furthermore, we assessed the structural connectivity per voxel of the STN seed region and discovered a gradient in connectivity to the premotor cortex within the STN. While going from the medial to the lateral part of the STN, the connectivity increases, confirming the expected lateral location of the STN motor part. Finally, the connectivity analysis produced evidence for the existence of a A-A?A 1/2 hyperdirect?? pathway between the motor cortex and the STN in humans, which is very useful for future research into stimulation targets. The results of these experiments indicate that it is possible to find the motor part of the STN as specific target for deep brain stimulation using structural connectivity information acquired in a noninvasive way. Third and last, we studied functional connectivity using resting state functional MRI data of healthy volunteers. 
The resulting significant clusters provided us with the first complete description of the STNA-A?A 1/2 s resting state functional connectivity, which corresponds with the expectations based on available literature. Moreover, we performed a reverse regression procedure with the average time
-                                                       series signals in motor and limbic areas as principal regressors. The results were analyzed for each STN voxel separately and also showed mediolateral gradients in functional connectivity within the STN. The lateral STN part exhibited more motor connectivity, while the medial part seemed to be more functionally connected to limbic brain areas, as described in neuronal tracer studies. These results show that functional connectivity analysis also is a viable noninvasive method to find the motor part of the STN. The work on noninvasive MRI methods for identification of the STN and its functional parts, as presented in this thesis, thus contributes to future specific stimulation of the motor part of the STN for deep brain stimulation in patients with ParkinsonA-A?A 1/2 s disease. This may help to maximize the motor effects and minimize severe cognitive and psychiatric side effects.},
+                                                         series signals in motor and limbic areas as principal regressors. The results were analyzed for each STN voxel separately and also showed mediolateral gradients in functional connectivity within the STN. The lateral STN part exhibited more motor connectivity, while the medial part seemed to be more functionally connected to limbic brain areas, as described in neuronal tracer studies. These results show that functional connectivity analysis also is a viable noninvasive method to find the motor part of the STN. The work on noninvasive MRI methods for identification of the STN and its functional parts, as presented in this thesis, thus contributes to future specific stimulation of the motor part of the STN for deep brain stimulation in patients with Parkinson's disease. This may help to maximize the motor effects and minimize severe cognitive and psychiatric side effects.},
   copromotor = {B. Platel},
   file = {Brun11a.pdf:pdf/Brun11a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -4982,19 +5005,19 @@ @article{Bult20
   url = {https://arxiv.org/abs/1907.07980},
   algorithm = {https://grand-challenge.org/algorithms/gleason-grading-of-prostate-biopsies/},
   abstract = {BACKGROUND:
-                                                       The Gleason score is the strongest correlating predictor of recurrence for prostate cancer, but has substantial inter-observer variability, limiting its usefulness for individual patients. Specialised urological pathologists have greater concordance; however, such expertise is not widely available. Prostate cancer diagnostics could thus benefit from robust, reproducible Gleason grading. We aimed to investigate the potential of deep learning to perform automated Gleason grading of prostate biopsies.
-
-                                                       METHODS:
-                                                       In this retrospective study, we developed a deep-learning system to grade prostate biopsies following the Gleason grading standard. The system was developed using randomly selected biopsies, sampled by the biopsy Gleason score, from patients at the Radboud University Medical Center (pathology report dated between Jan 1, 2012, and Dec 31, 2017). A semi-automatic labelling technique was used to circumvent the need for manual annotations by pathologists, using pathologists' reports as the reference standard during training. The system was developed to delineate individual glands, assign Gleason growth patterns, and determine the biopsy-level grade. For validation of the method, a consensus reference standard was set by three expert urological pathologists on an independent test set of 550 biopsies. Of these 550, 100 were used in an observer experiment, in which the system, 13 pathologists, and two pathologists in training were compared with respect to the reference standard. The system was also compared to an external test dataset of 886 cores, which contained 245 cores from a different centre that were independently graded by two pathologists.
-
-                                                       FINDINGS:
-                                                       We collected 5759 biopsies from 1243 patients. The developed system achieved a high agreement with the reference standard (quadratic Cohen's kappa 0*918, 95% CI 0*891-0*941) and scored highly at clinical decision thresholds: benign versus malignant (area under the curve 0*990, 95% CI 0*982-0*996), grade group of 2 or more (0*978, 0*966-0*988), and grade group of 3 or more (0*974, 0*962-0*984). In an observer experiment, the deep-learning system scored higher (kappa 0*854) than the panel (median kappa 0*819), outperforming 10 of 15 pathologist observers. On the external test dataset, the system obtained a high agreement with the reference standard set independently by two pathologists (quadratic Cohen's kappa 0*723 and 0*707) and within inter-observer variability (kappa 0*71).
-
-                                                       INTERPRETATION:
-                                                       Our automated deep-learning system achieved a performance similar to pathologists for Gleason grading and could potentially contribute to prostate cancer diagnosis. The system could potentially assist pathologists by screening biopsies, providing second opinions on grade group, and presenting quantitative measurements of volume percentages.
-
-                                                       FUNDING:
-                                                       Dutch Cancer Society.},
+                                                         The Gleason score is the strongest correlating predictor of recurrence for prostate cancer, but has substantial inter-observer variability, limiting its usefulness for individual patients. Specialised urological pathologists have greater concordance; however, such expertise is not widely available. Prostate cancer diagnostics could thus benefit from robust, reproducible Gleason grading. We aimed to investigate the potential of deep learning to perform automated Gleason grading of prostate biopsies.
+  
+                                                         METHODS:
+                                                         In this retrospective study, we developed a deep-learning system to grade prostate biopsies following the Gleason grading standard. The system was developed using randomly selected biopsies, sampled by the biopsy Gleason score, from patients at the Radboud University Medical Center (pathology report dated between Jan 1, 2012, and Dec 31, 2017). A semi-automatic labelling technique was used to circumvent the need for manual annotations by pathologists, using pathologists' reports as the reference standard during training. The system was developed to delineate individual glands, assign Gleason growth patterns, and determine the biopsy-level grade. For validation of the method, a consensus reference standard was set by three expert urological pathologists on an independent test set of 550 biopsies. Of these 550, 100 were used in an observer experiment, in which the system, 13 pathologists, and two pathologists in training were compared with respect to the reference standard. The system was also compared to an external test dataset of 886 cores, which contained 245 cores from a different centre that were independently graded by two pathologists.
+  
+                                                         FINDINGS:
+                                                         We collected 5759 biopsies from 1243 patients. The developed system achieved a high agreement with the reference standard (quadratic Cohen's kappa 0.918, 95% CI 0.891-0.941) and scored highly at clinical decision thresholds: benign versus malignant (area under the curve 0.990, 95% CI 0.982-0.996), grade group of 2 or more (0.978, 0.966-0.988), and grade group of 3 or more (0.974, 0.962-0.984). In an observer experiment, the deep-learning system scored higher (kappa 0.854) than the panel (median kappa 0.819), outperforming 10 of 15 pathologist observers. On the external test dataset, the system obtained a high agreement with the reference standard set independently by two pathologists (quadratic Cohen's kappa 0.723 and 0.707) and within inter-observer variability (kappa 0.71).
+  
+                                                         INTERPRETATION:
+                                                         Our automated deep-learning system achieved a performance similar to pathologists for Gleason grading and could potentially contribute to prostate cancer diagnosis. The system could potentially assist pathologists by screening biopsies, providing second opinions on grade group, and presenting quantitative measurements of volume percentages.
+  
+                                                         FUNDING:
+                                                         Dutch Cancer Society.},
   file = {:pdf/Bult20.pdf:PDF},
   optnote = {DIAG},
   pmid = {31926805},
@@ -5046,11 +5069,11 @@ @phdthesis{Bult22a
   year = {2022},
   url = {https://repository.ubn.ru.nl/handle/2066/241550},
   abstract = {The histological grading of prostate biopsies is a crucial element in the diagnostic pathway of prostate cancer. The known high inter- and intraobserver variability show potential and a need for assisting pathologists in this task. Furthermore, a global shortage of pathologists stresses the demand for reproducible, more efficient, and easily accessible diagnostic solutions. This thesis's primary aim was to investigate and design an AI-based system to detect and grade prostate cancer in biopsies. A second aim was to evaluate the potential clinical merits of AI-assisted grading when such systems are embedded in the pathologist's workflow. To this extent, the following objectives were undertaken as part of this thesis:
-
-                             1. The development of an automated system that can distinguish epithelial tissue from other tissue types within H&E stained prostate specimens (Chapter 2);
-                             2. The development and validation of an automated system for grading prostate biopsies using the Gleason grading system (Chapter 3);
-                             3. A multi-center independent evaluation of state-of-the-art algorithms for automated Gleason grading sourced through a large-scale medical AI competition(Chapter 4);
-                             4. The investigation of the potential merits of AI-assisted grading of prostate cancer through an observer study (Chapter 5).},
+  
+                               1. The development of an automated system that can distinguish epithelial tissue from other tissue types within H&E stained prostate specimens (Chapter 2);
+                               2. The development and validation of an automated system for grading prostate biopsies using the Gleason grading system (Chapter 3);
+                               3. A multi-center independent evaluation of state-of-the-art algorithms for automated Gleason grading sourced through a large-scale medical AI competition (Chapter 4);
+                               4. The investigation of the potential merits of AI-assisted grading of prostate cancer through an observer study (Chapter 5).},
   copromotor = {G. Litjens and C. Hulbergen-van de Kaa},
   file = {Bult22a.pdf:pdf\\Bult22a.pdf:PDF},
   optnote = {DIAG},
@@ -5113,9 +5136,9 @@ @article{Caba21
   doi = {10.1117/1.jmi.8.2.024501},
   year = {2021},
   abstract = {Purpose: A computer-aided diagnosis (CADx) system for breast masses is proposed, which incorporates both handcrafted and convolutional radiomic features embedded into a single deep learning model.
-                         Approach: The model combines handcrafted and convolutional radiomic signatures into a multi-view architecture, which retrieves three-dimensional (3D) image information by simultaneously processing multiple two-dimensional mass patches extracted along different planes through the 3D mass volume. Each patch is processed by a stream composed of two concatenated parallel branches: a multi-layer perceptron fed with automatically extracted handcrafted radiomic features, and a convolutional neural network, for which discriminant features are learned from the input patches. All streams are then concatenated together into a final architecture, where all network weights are shared and the learning occurs simultaneously for each stream and branch. The CADx system was developed and tested for diagnosis of breast masses (N  =  284) using image datasets acquired with independent dedicated breast computed tomography systems from two different institutions. The diagnostic classification performance of the CADx system was compared against other machine and deep learning architectures adopting handcrafted and convolutional approaches, and three board-certified breast radiologists.
-                         Results: On a test set of 82 masses (45 benign, 37 malignant), the proposed CADx system performed better than all other model architectures evaluated, with an increase in the area under the receiver operating characteristics curve (AUC) of 0.05  +-  0.02, and achieving a final AUC of 0.947, outperforming the three radiologists (AUC  =  0.814  -  0.902).
-                         Conclusions: In conclusion, the system demonstrated its potential usefulness in breast cancer diagnosis by improving mass malignancy assessment.},
+                           Approach: The model combines handcrafted and convolutional radiomic signatures into a multi-view architecture, which retrieves three-dimensional (3D) image information by simultaneously processing multiple two-dimensional mass patches extracted along different planes through the 3D mass volume. Each patch is processed by a stream composed of two concatenated parallel branches: a multi-layer perceptron fed with automatically extracted handcrafted radiomic features, and a convolutional neural network, for which discriminant features are learned from the input patches. All streams are then concatenated together into a final architecture, where all network weights are shared and the learning occurs simultaneously for each stream and branch. The CADx system was developed and tested for diagnosis of breast masses (N  =  284) using image datasets acquired with independent dedicated breast computed tomography systems from two different institutions. The diagnostic classification performance of the CADx system was compared against other machine and deep learning architectures adopting handcrafted and convolutional approaches, and three board-certified breast radiologists.
+                           Results: On a test set of 82 masses (45 benign, 37 malignant), the proposed CADx system performed better than all other model architectures evaluated, with an increase in the area under the receiver operating characteristics curve (AUC) of 0.05  +-  0.02, and achieving a final AUC of 0.947, outperforming the three radiologists (AUC  =  0.814  -  0.902).
+                           Conclusions: In conclusion, the system demonstrated its potential usefulness in breast cancer diagnosis by improving mass malignancy assessment.},
   url = {http://dx.doi.org/10.1117/1.JMI.8.2.024501},
   file = {Caba21.pdf:pdf\\Caba21.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -5135,16 +5158,16 @@ @inproceedings{Call19
   year = {2019},
   series = SPIE,
   abstract = {In this work we analyze the eect of label noise in training and test data when performing classication experi-
-                                                       ments on chest radiographs (CXRs) with modern deep learning architectures. We use ChestXRay14, the largest
-                                                       publicly available CXR dataset. We simulate situs inversus by horizontal ipping of the CXRs, allowing us to
-                                                       precisely control the amount of label noise. We also perform experiments in classifying emphysema using the
-                                                       ChestXRay14 provided labels that are known to be noisy. Our situs inversus experiments conrm results from
-                                                       the computer vision literature that deep learning architectures are relatively robust but not completely insensi-
-                                                       tive to label noise in the training data: without or with very low noise, classication results are near perfect; 16%
-                                                       and 32% training label noise only lead to a 1.5% and 4.6% drop in accuracy. We investigate two metrics that
-                                                       could be used to identify test samples that have an incorrect label: model condence and model uncertainty. We
-                                                       show, in an observer study with an experienced chest radiologist, that both measures are eective in identifying
-                                                       samples in ChestXRay14 that are erroneously labeled for the presence of emphysema.},
+                                                         ments on chest radiographs (CXRs) with modern deep learning architectures. We use ChestXRay14, the largest
+                                                         publicly available CXR dataset. We simulate situs inversus by horizontal flipping of the CXRs, allowing us to
+                                                         precisely control the amount of label noise. We also perform experiments in classifying emphysema using the
+                                                         ChestXRay14 provided labels that are known to be noisy. Our situs inversus experiments confirm results from
+                                                         the computer vision literature that deep learning architectures are relatively robust but not completely insensitive
+                                                         to label noise in the training data: without or with very low noise, classification results are near perfect; 16%
+                                                         and 32% training label noise only lead to a 1.5% and 4.6% drop in accuracy. We investigate two metrics that
+                                                         could be used to identify test samples that have an incorrect label: model confidence and model uncertainty. We
+                                                         show, in an observer study with an experienced chest radiologist, that both measures are effective in identifying
+                                                         samples in ChestXRay14 that are erroneously labeled for the presence of emphysema.},
   file = {Call19.pdf:pdf\\Call19.pdf:PDF},
   optnote = {DIAG},
   number = {1},
@@ -5383,15 +5406,15 @@ @conference{Char15a
   year = {2015},
   url = {http://rsna2015.rsna.org/program/},
   abstract = {PURPOSE
-                                                       Changes in the morphology of the airways contributes to lung function impairment in chronic obstructive pulmonary disease (COPD). Measurements of airway morphology might be influenced by the quality of the airway segmentation. In this study we investigate the stability of a commonly used airway measurement (Pi10) from CT scans for varying segmentation depths of the airways.
-                                                       METHOD AND MATERIALS
-                                                       Inspiratory low-dose thoracic CT scans of 267 subjects, well distributed over GOLD stages, were selected for this study. Airways were automatically extracted by a state-of-the-art segmentation method and manually corrected to ensure a leakage free segmentation. Airway wall thickness quantification was performed in orthogonal cross-sections every 1mm throughout the entire airway tree using an intensity-integration technique which accounts for partial volume effects. Using regression on all cross-sectional measurements, airway morphology was expressed as the square root of wall area at airways with a perimeter of 10mm (Pi10). To determine the sensitivity of the Pi10 measurement to the length of the segmented airway tree, sensitivity analysis was performed on Pi10 by leaving-out wall measurements of the smallest airways and recalculating the Pi10. For each subject, Pi10 regression analysis was repeated excluding airways with a lumen perimeter below 6mm, 8mm or 10mm. The recalculated Pi10 measurements were compared to the baseline Pi10.
-                                                       RESULTS
-                                                       The segmented airway trees consisted for 55% of airways with lumen diameters below 10mm, 19% below 8mm, and 1% below 6mm.The average baseline Pi10 of all subjects was 2.43 +/- 0.56 (range [1.40, 4.36]), which corresponds to an average airway wall thickness (for an airway with a lumen perimeter of 10mm) of 0.52mm +/- 0.21mm. By excluding airways with a lumen perimeter below 6, 8 or 10mm from the regression analysis, absolute changes in Pi10 were 0.003 +/- 0.004 (0.11%), 0.035 +/- 0.023 (1.46%), and 0.107 +/- 0.087 (4.6%), respectively, corresponding to changes in airway wall thickness (at 10mm lumen perimeter) of 0.001, 0.013, and 0.039mm.
-                                                       CONCLUSION
-                                                       The commonly used Pi10 measurement to express airway morphology from a CT scan is insensitive to the exclusion of smaller airways in the computation.
-                                                       CLINICAL RELEVANCE/APPLICATION
-                                                       When expressing airway morhplogy as Pi10, there is no need to (manually) adjust automatic airway segmentation methods to include smaller airways in order to obtain an accurate Pi10 measurement.},
+                                                         Changes in the morphology of the airways contribute to lung function impairment in chronic obstructive pulmonary disease (COPD). Measurements of airway morphology might be influenced by the quality of the airway segmentation. In this study we investigate the stability of a commonly used airway measurement (Pi10) from CT scans for varying segmentation depths of the airways.
+                                                         METHOD AND MATERIALS
+                                                         Inspiratory low-dose thoracic CT scans of 267 subjects, well distributed over GOLD stages, were selected for this study. Airways were automatically extracted by a state-of-the-art segmentation method and manually corrected to ensure a leakage free segmentation. Airway wall thickness quantification was performed in orthogonal cross-sections every 1mm throughout the entire airway tree using an intensity-integration technique which accounts for partial volume effects. Using regression on all cross-sectional measurements, airway morphology was expressed as the square root of wall area at airways with a perimeter of 10mm (Pi10). To determine the sensitivity of the Pi10 measurement to the length of the segmented airway tree, sensitivity analysis was performed on Pi10 by leaving-out wall measurements of the smallest airways and recalculating the Pi10. For each subject, Pi10 regression analysis was repeated excluding airways with a lumen perimeter below 6mm, 8mm or 10mm. The recalculated Pi10 measurements were compared to the baseline Pi10.
+                                                         RESULTS
+                                                         The segmented airway trees consisted for 55% of airways with lumen diameters below 10mm, 19% below 8mm, and 1% below 6mm. The average baseline Pi10 of all subjects was 2.43 +/- 0.56 (range [1.40, 4.36]), which corresponds to an average airway wall thickness (for an airway with a lumen perimeter of 10mm) of 0.52mm +/- 0.21mm. By excluding airways with a lumen perimeter below 6, 8 or 10mm from the regression analysis, absolute changes in Pi10 were 0.003 +/- 0.004 (0.11%), 0.035 +/- 0.023 (1.46%), and 0.107 +/- 0.087 (4.6%), respectively, corresponding to changes in airway wall thickness (at 10mm lumen perimeter) of 0.001, 0.013, and 0.039mm.
+                                                         CONCLUSION
+                                                         The commonly used Pi10 measurement to express airway morphology from a CT scan is insensitive to the exclusion of smaller airways in the computation.
+                                                         CLINICAL RELEVANCE/APPLICATION
+                                                         When expressing airway morphology as Pi10, there is no need to (manually) adjust automatic airway segmentation methods to include smaller airways in order to obtain an accurate Pi10 measurement.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -5402,15 +5425,15 @@ @conference{Char15b
   year = {2015},
   url = {http://rsna2015.rsna.org/program/},
   abstract = {PURPOSE
-                                                       Automated classification of pulmonary arteries and veins in thoracic CT scans is an unsolved problem which is important for e.g. CAD of pulmonary embolisms and treatment planning. This study presents and validates a new anatomy-based method to automatically classify arteries and veins in non-contrast chest CT scans.
-                                                       METHOD AND MATERIALS
-                                                       A set of 55 full inspiration non-contrast low dose chest CT scans (16x0.75mm, 120-140kVp, 30mAs) with variable severity of emphysema and interstitial lung diseases, were taken from a lung cancer screening trial. In all state-of-the-art vessel segmentation algorithms, arteries and veins are attached at locations where they cross, since these algorithms are not designed to distinguish between bifurcating and crossing vessels. This method starts with automatic vessel segmentation, followed by pruning the vessel segmentation to detect locations that are inconsistent with the topology of a tree structure. By disconnecting the vessels at these locations, the vessel segmentation is separated into subtrees that fulfill a tree structure and are assumed to be of an arterial or venous label. Next, subtrees are grouped using anatomical knowledge that arterial and venous capillaries meet each other at the alveoli, which implies that the corresponding peripheral arteries and veins go towards similar regions. By analyzing the peripheral vessels in each subtree, subtrees of the same artery-vein label are grouped without knowing the actual label. To extract the final artery-vein labels of the grouped subtrees, classification is performed using the fact that veins have an overall larger volume compared to arteries. For quantitative evaluation, two human observers manually labeled a total of 2750 randomly selected arteries and veins from all 55 scans. The accuracy and Cohen's kappa between the observers and between the method and observers were used for evaluation.
-                                                       RESULTS
-                                                       Inter-observer Cohen's kappa was 0.84 with 93% accuracy. The proposed method achieved a mean accuracy of 88% and a Cohen's kappa of 0.76.
-                                                       CONCLUSION
-                                                       A new concept for artery-vein separation and classification was presented that uses anatomical information from peripheral arteries and veins. The performance of the presented method closely approximated the inter-observer agreement.
-                                                       CLINICAL RELEVANCE/APPLICATION
-                                                       Automatic artery-vein classification is essential for investigating pulmonary hypertension, COPD and for improving CAD systems for pulmonary embolisms.},
+                                                         Automated classification of pulmonary arteries and veins in thoracic CT scans is an unsolved problem which is important for e.g. CAD of pulmonary embolisms and treatment planning. This study presents and validates a new anatomy-based method to automatically classify arteries and veins in non-contrast chest CT scans.
+                                                         METHOD AND MATERIALS
+                                                         A set of 55 full inspiration non-contrast low dose chest CT scans (16x0.75mm, 120-140kVp, 30mAs) with variable severity of emphysema and interstitial lung diseases, were taken from a lung cancer screening trial. In all state-of-the-art vessel segmentation algorithms, arteries and veins are attached at locations where they cross, since these algorithms are not designed to distinguish between bifurcating and crossing vessels. This method starts with automatic vessel segmentation, followed by pruning the vessel segmentation to detect locations that are inconsistent with the topology of a tree structure. By disconnecting the vessels at these locations, the vessel segmentation is separated into subtrees that fulfill a tree structure and are assumed to be of an arterial or venous label. Next, subtrees are grouped using anatomical knowledge that arterial and venous capillaries meet each other at the alveoli, which implies that the corresponding peripheral arteries and veins go towards similar regions. By analyzing the peripheral vessels in each subtree, subtrees of the same artery-vein label are grouped without knowing the actual label. To extract the final artery-vein labels of the grouped subtrees, classification is performed using the fact that veins have an overall larger volume compared to arteries. For quantitative evaluation, two human observers manually labeled a total of 2750 randomly selected arteries and veins from all 55 scans. The accuracy and Cohen's kappa between the observers and between the method and observers were used for evaluation.
+                                                         RESULTS
+                                                         Inter-observer Cohen's kappa was 0.84 with 93% accuracy. The proposed method achieved a mean accuracy of 88% and a Cohen's kappa of 0.76.
+                                                         CONCLUSION
+                                                         A new concept for artery-vein separation and classification was presented that uses anatomical information from peripheral arteries and veins. The performance of the presented method closely approximated the inter-observer agreement.
+                                                         CLINICAL RELEVANCE/APPLICATION
+                                                         Automatic artery-vein classification is essential for investigating pulmonary hypertension, COPD and for improving CAD systems for pulmonary embolisms.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -5420,14 +5443,14 @@ @conference{Char16
   booktitle = ATS,
   year = {2016},
   abstract = {{RATIONALE:}
-                                                       To evaluate the relative contributions of quantitative {CT} ({QCT}) measures of emphysema, air trapping, and airway wall thickening and narrowing, to airflow obstruction in cigarette smokers with and without chronic obstructive lung disease ({COPD}).
-                                                       {METHOD:}
-                                                       2000 cigarette smokers participating in the {COPDG}ene study were evaluated, 818 subjects were excluded because of missing {QCT}. Thirona Lung Quantification software was used to extract {QCT} measures from inspiratory and expiratory {CT} scans for each subject, including emphysema (%{LAA}-950, defined as the percentage of low attenuation areas ({LAA}) below -950{HU} in inspiratory scans), gas trapping (%{LAA}-856, defined as the percentage of {LAA} below -856{HU} in expiration), and an index score for airway wall thickening and/or narrowing (Pi10, defined as the root of the wall area of a hypothetical airway of 10-mm internal perimeter). The evaluated spirometry measures included the ratio of forced expiratory volume in 1 second ({FEV}1) and forced vital capacity ({FVC}), and the predicted percentage of {FEV}1 ({FEV}1%-predicted).
-                                                       {QCT} measures were correlated to {FEV}1/{FVC} and {FEV}1%-predicted using Pearson correlation. In addition, multiple linear regression analysis was used to evaluate the predicted value of the {QCT} measures on both {FEV}1/{FVC} and {FEV}1%-predicted. For these models, the spirometry measures were log10-transformed to ensure a distribution of residuals closer to normal.
-                                                       {RESULTS:}
-                                                       The 1183 subjects were divided over {GOLD} stagesChar16 0 to 4:478, 100, 279, 143 and 47. 136 subjects were unclassified by {GOLD}. %{LAA}-950, %{LAA}-856, and Pi10 correlated significantly with both {FEV}1/{FVC} (p<0.0001, r= -0.758, r=-0.829, and r=-0.423, respectively) and {FEV}1%-predicted (p<0.0001, r= -0.628, r=-0.728, and r=-0.547, respectively). In the regression model for {FEV}1/{FVC}, the combination of the three {QCT} measures accounted for 74.5% of the variation in {FEV}1/{FVC}, with a relative contribution of 68.7% for %{LAA}-856, 3.1% for Pi10, and 2.6% for %{LAA}-950. In the regression model for {FEV}1%-predicted, the combination of the three {QCT} measures accounted for 65.8% of the variation in {FEV}1%-predicted, with a relative contribution of 52.9% for %{LAA}-856, 11.6% for Pi10, and 1.3% for %{LAA}-950.
-                                                       {CONCLUSION:}
-                                                       Gas trapping and airway wall thickening and/or narrowing are the major contributors to airflow obstruction in cigarette smokers.},
+                                                         To evaluate the relative contributions of quantitative {CT} ({QCT}) measures of emphysema, air trapping, and airway wall thickening and narrowing, to airflow obstruction in cigarette smokers with and without chronic obstructive lung disease ({COPD}).
+                                                         {METHOD:}
+                                                         2000 cigarette smokers participating in the {COPDG}ene study were evaluated, 818 subjects were excluded because of missing {QCT}. Thirona Lung Quantification software was used to extract {QCT} measures from inspiratory and expiratory {CT} scans for each subject, including emphysema (%{LAA}-950, defined as the percentage of low attenuation areas ({LAA}) below -950{HU} in inspiratory scans), gas trapping (%{LAA}-856, defined as the percentage of {LAA} below -856{HU} in expiration), and an index score for airway wall thickening and/or narrowing (Pi10, defined as the root of the wall area of a hypothetical airway of 10-mm internal perimeter). The evaluated spirometry measures included the ratio of forced expiratory volume in 1 second ({FEV}1) and forced vital capacity ({FVC}), and the predicted percentage of {FEV}1 ({FEV}1%-predicted).
+                                                         {QCT} measures were correlated to {FEV}1/{FVC} and {FEV}1%-predicted using Pearson correlation. In addition, multiple linear regression analysis was used to evaluate the predicted value of the {QCT} measures on both {FEV}1/{FVC} and {FEV}1%-predicted. For these models, the spirometry measures were log10-transformed to ensure a distribution of residuals closer to normal.
+                                                         {RESULTS:}
+                                                         The 1183 subjects were divided over {GOLD} stages 0 to 4: 478, 100, 279, 143 and 47. 136 subjects were unclassified by {GOLD}. %{LAA}-950, %{LAA}-856, and Pi10 correlated significantly with both {FEV}1/{FVC} (p<0.0001, r= -0.758, r=-0.829, and r=-0.423, respectively) and {FEV}1%-predicted (p<0.0001, r= -0.628, r=-0.728, and r=-0.547, respectively). In the regression model for {FEV}1/{FVC}, the combination of the three {QCT} measures accounted for 74.5% of the variation in {FEV}1/{FVC}, with a relative contribution of 68.7% for %{LAA}-856, 3.1% for Pi10, and 2.6% for %{LAA}-950. In the regression model for {FEV}1%-predicted, the combination of the three {QCT} measures accounted for 65.8% of the variation in {FEV}1%-predicted, with a relative contribution of 52.9% for %{LAA}-856, 11.6% for Pi10, and 1.3% for %{LAA}-950.
+                                                         {CONCLUSION:}
+                                                         Gas trapping and airway wall thickening and/or narrowing are the major contributors to airflow obstruction in cigarette smokers.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -5437,7 +5460,7 @@ @conference{Char16a
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE: We investigated the relationship between airway dimensions and airflow obstruction and respiratory quality of life in current and former cigarette smokers. METHOD AND MATERIALS:
-                                                       Cigarette smokers were studied that enrolled in the COPDGene study. Spirometry assessment included forced expiratory volume in 1 sec (FEV1), forced vital capacity (FVC), % predicted FEV1 (FEV1%-p), % predicted FVC (FVC%-p), and peak expiratory flow (PEF). Respiratory quality of life was assessed by the St George's Respiratory Questionnaire (SGRQ) score and 6 Minute Walking Distance (SMWD). Inspiratory CT was available to extract the airways, the amount of emphysema, and the total lung capacity (TLC). Lumen perimeters and airway wall areas were automatically extracted perpendicular to the airways. Linear regression was performed on these measurements to calculate an index score of airway wall thickness, expressed as the square root of wall area at airways with a perimeter of 10mm (Pi10). Emphysema was defined as the percentage of low-attenuation area below -950 HU (LAA%-950). Multiple linear regression was used to determine the predictive value of Pi10 and smoking status on airflow obstruction and respiratory quality of life. An interaction was included in the model to investigate if the effect of Pi10 differed by smoking status. All models were adjusted for age, gender, body mass index, pack years, bronchodilator responsiveness, TLC, and LAA%-950. RESULTS: 1544 cigarette smokers (894 former smokers) were included, with a mean age of 60.7 A-A?A 1/2  8.9 years and a mean Pi10 of 2.23 A-A?A 1/2  0.57mm. Pi10 was significantly associated with all airflow obstruction and respiratory quality of life measures (all p<0.001). The interaction between Pi10 and smoking status was significant for all measures except FVC%-p (p=0.30) and SGRQ score (p=0.064). This indicates that the effect of Pi10 on FEV1%-p, PEF, FEV1/FVC and SMWD was significantly reduced in current smokers compared to former smokers. CONCLUSION: Pi10 independently contributes to airflow obstruction and respiratory quality of life. 
This effect is stronger in former smokers as compared to current smokers. CLNICAL RELEVANCE/APPLICATION: Pi10 is an independent marker for airflow obstruction and respiratory quality of life and may be more strongly associated with these outcomes in former smokers than current smokers.},
+                                                         Cigarette smokers were studied that enrolled in the COPDGene study. Spirometry assessment included forced expiratory volume in 1 sec (FEV1), forced vital capacity (FVC), % predicted FEV1 (FEV1%-p), % predicted FVC (FVC%-p), and peak expiratory flow (PEF). Respiratory quality of life was assessed by the St George's Respiratory Questionnaire (SGRQ) score and 6 Minute Walking Distance (SMWD). Inspiratory CT was available to extract the airways, the amount of emphysema, and the total lung capacity (TLC). Lumen perimeters and airway wall areas were automatically extracted perpendicular to the airways. Linear regression was performed on these measurements to calculate an index score of airway wall thickness, expressed as the square root of wall area at airways with a perimeter of 10mm (Pi10). Emphysema was defined as the percentage of low-attenuation area below -950 HU (LAA%-950). Multiple linear regression was used to determine the predictive value of Pi10 and smoking status on airflow obstruction and respiratory quality of life. An interaction was included in the model to investigate if the effect of Pi10 differed by smoking status. All models were adjusted for age, gender, body mass index, pack years, bronchodilator responsiveness, TLC, and LAA%-950. RESULTS: 1544 cigarette smokers (894 former smokers) were included, with a mean age of 60.7 +/- 8.9 years and a mean Pi10 of 2.23 +/- 0.57mm. Pi10 was significantly associated with all airflow obstruction and respiratory quality of life measures (all p<0.001). The interaction between Pi10 and smoking status was significant for all measures except FVC%-p (p=0.30) and SGRQ score (p=0.064). This indicates that the effect of Pi10 on FEV1%-p, PEF, FEV1/FVC and SMWD was significantly reduced in current smokers compared to former smokers. CONCLUSION: Pi10 independently contributes to airflow obstruction and respiratory quality of life. This effect is stronger in former smokers as compared to current smokers. CLINICAL RELEVANCE/APPLICATION: Pi10 is an independent marker for airflow obstruction and respiratory quality of life and may be more strongly associated with these outcomes in former smokers than current smokers.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -5447,7 +5470,7 @@ @conference{Char16b
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE: To predict COPD and smoking-related morbidity in cigarette smokers using quantitative CT (QCT) measures. METHOD AND MATERIALS: 1544 subjects were included from the COPDGene study. COPD was defined by a ratio of forced expiratory volume in 1 sec. (FEV1) and forced vital capacity (FVC) < 0.7. Smoking-related morbidity was defined as FEV1/FVC < 0.70 with either a St George's Respiratory Questionnaire score >= 25 or an exacerbation frequency >= 2/year. On inspiratory CT, multiple cross-sectional lumen perimeters and airway wall areas were extracted from the airways. Using linear regression, airway wall thickness was defined as the square root of wall area of an airway with a perimeter of 10mm (Pi10). Total lung capacity (TLC) and emphysema were measured on inspiratory CT, where emphysema was defined as the % of low-attenuation areas (LAA%) < -950HU (LAA%-950). Air-trapping was defined on expiratory CT as LAA% < -856HU (LAA%-856). Six logistic regression models were fitted for both the prediction of COPD and smoking-related morbidity using a random subset of 761 subjects. Model 1 included only age, gender, BMI, pack years, smoking status, and TLC, while models 2 to 6 additionally included: LAA%-950 (model 2), LAA%-856 (model 3), Pi10 (model 4), LAA%-950 + Pi10 (model 5), and LAA%-950 + LAA%-856 + Pi10 (model 6). The models were validated on a separate set (810 subjects) using the area under the receiver operating curve (AUC). RESULTS:
-                                                       The validation set consisted of 369 subjects with and 441 without COPD. QCT measures were independent predictors of COPD in all models (p<0.001), with AUC values for models 1 to 6 of 0.77, 0.85, 0.90, 0.87, 0.91, and 0.93, respectively. The validation set consisted of 216 subject with and 594 without smoking-related morbidity. QCT measures were independent predictors of smoking-related morbidity in all models (p<0.001, except for LAA%-950 in model 5), with AUC values for models 1 to 6 of 0.72, 0.83, 0.87, 0.83, 0.88, and 0.89, respectively. CONCLUSION: LAA%-950, LAA%-856, and Pi10 are independent predictors of COPD and smoking-related morbidity. The model including only inspiratory QCT predictors has similar predictive value to the model that also includes expiratory air-trapping. CLNICAL RELEVANCE/APPLICATION: Since LAA%-950 and Pi10 can be readily extracted from inspiratory images, these measures may be useful to predict smoking related morbidity in lung cancer screening.},
+                                                         The validation set consisted of 369 subjects with and 441 without COPD. QCT measures were independent predictors of COPD in all models (p<0.001), with AUC values for models 1 to 6 of 0.77, 0.85, 0.90, 0.87, 0.91, and 0.93, respectively. The validation set consisted of 216 subjects with and 594 without smoking-related morbidity. QCT measures were independent predictors of smoking-related morbidity in all models (p<0.001, except for LAA%-950 in model 5), with AUC values for models 1 to 6 of 0.72, 0.83, 0.87, 0.83, 0.88, and 0.89, respectively. CONCLUSION: LAA%-950, LAA%-856, and Pi10 are independent predictors of COPD and smoking-related morbidity. The model including only inspiratory QCT predictors has similar predictive value to the model that also includes expiratory air-trapping. CLINICAL RELEVANCE/APPLICATION: Since LAA%-950 and Pi10 can be readily extracted from inspiratory images, these measures may be useful to predict smoking related morbidity in lung cancer screening.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -5461,8 +5484,8 @@ @article{Char16c
   doi = {10.1016/j.media.2016.11.001},
   url = {http://dx.doi.org/10.1016/j.media.2016.11.001},
   abstract = {We propose a novel method to improve airway segmentation in thoracic computed tomography (CT) by detecting and removing leaks. Leak detection is formulated as a classification problem, in which a convolutional network (ConvNet) is trained in a supervised fashion to perform the classification task. In order to increase the segmented airway tree length, we take advantage of the fact that multiple segmentations can be extracted from a given airway segmentation algorithm by varying the parameters that influence the tree length and the amount of leaks. We propose a strategy in which the combination of these segmentations after removing leaks can increase the airway tree length while limiting the amount of leaks. This strategy therefore largely circumvents the need for parameter fine-tuning of a given airway segmentation algorithm.
-
-                                                       The ConvNet was trained and evaluated using a subset of inspiratory thoracic CT scans taken from the COPDGene study. Our method was validated on a separate independent set of the EXACT?09 challenge. We show that our method significantly improves the quality of a given leaky airway segmentation, achieving a higher sensitivity at a low false-positive rate compared to all the state-of-the-art methods that entered in EXACT09, and approaching the performance of the combination of all of them.},
+  
+                                                         The ConvNet was trained and evaluated using a subset of inspiratory thoracic CT scans taken from the COPDGene study. Our method was validated on a separate independent set of the EXACT'09 challenge. We show that our method significantly improves the quality of a given leaky airway segmentation, achieving a higher sensitivity at a low false-positive rate compared to all the state-of-the-art methods that entered in EXACT'09, and approaching the performance of the combination of all of them.},
   file = {Char16c.pdf:pdf\\Char16c.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27842236},
@@ -5547,17 +5570,18 @@ @article{Chav93
   gscites = {16},
 }
 
-@inproceedings{Chel22,
-  author = {Eduard Chelebian and Francesco Ciompi},
-  booktitle = {Learning Meaningful Representations of Life, NeurIPS 2022},
+@article{Chel22,
+  author = {Chelebian, Eduard and Ciompi, Francesco and W{\"a}hlby, Carolina},
   title = {Seeded iterative clustering for histology region identification},
+  doi = {10.48550/ARXIV.2211.07425},
+  year = {2022},
   abstract = {Annotations are necessary to develop computer vision algorithms for histopathology, but dense annotations at a high resolution are often time-consuming to make. Deep learning models for segmentation are a way to alleviate the process, but require large amounts of training data, training times and computing power. To address these issues, we present seeded iterative clustering to produce a coarse segmentation densely and at the whole slide level. The algorithm uses precomputed representations as the clustering space and a limited amount of sparse interactive annotations as seeds to iteratively classify image patches. We obtain a fast and effective way of generating dense annotations for whole slide images and a framework that allows the comparison of neural network latent representations in the context of transfer learning.},
+  url = {https://arxiv.org/abs/2211.07425},
   file = {Chel22.pdf:pdf\\Chel22.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
-  year = {2022},
-  ss_id = {892d4f3ee2cc9ff2dcf1c308fae68473dcf787d2},
+  journal = {arXiv:2211.07425},
+  automatic = {yes},
   all_ss_ids = {['892d4f3ee2cc9ff2dcf1c308fae68473dcf787d2']},
-  gscites = {1},
 }
 
 @article{Chel24,
@@ -5648,13 +5672,13 @@ @phdthesis{Chle22
   title = {Deep Learning-Based Segmentation in Multimodal Abdominal Imaging},
   url = {https://repository.ubn.ru.nl/handle/2066/251471},
   abstract = {This thesis is devoted to the applications of deep learning segmentation algorithms to multimodal abdominal imaging. It focuses on the segmentation of liver, prostate, and liver tumors in CT and MRI images.
-                             It aims not only to propose and evaluate new segmentation architectures, but also to investigate aspects such as the required time for the correction of automatic segmentation results, the impact on the inter-observer variability, and the optimization of annotation effort.
-                             The following objectives were undertaken as part of this thesis:
-
-                             1. The development of a two-stage cascade system for liver and liver tumor segmentation in CT images (Chapter 2);
-                             2. The development of an ensemble of three orthogonal 2D CNNs for liver segmentation in late-phase T1W MRI images (Chapter 3);
-                             3. The investigation of various active learning strategies to optimally select a set of CT slices to obtain the best possible liver segmentation method in CT without the need to manually annotate a large amount of training data (Chapter 4);
-                             4. The development of a novel multi-planar 3D anisotropic CNN architecture for prostate segmentation in multi-planar T2W MRI images (Chapter 5).},
+                               It aims not only to propose and evaluate new segmentation architectures, but also to investigate aspects such as the required time for the correction of automatic segmentation results, the impact on the inter-observer variability, and the optimization of annotation effort.
+                               The following objectives were undertaken as part of this thesis:
+  
+                               1. The development of a two-stage cascade system for liver and liver tumor segmentation in CT images (Chapter 2);
+                               2. The development of an ensemble of three orthogonal 2D CNNs for liver segmentation in late-phase T1W MRI images (Chapter 3);
+                               3. The investigation of various active learning strategies to optimally select a set of CT slices to obtain the best possible liver segmentation method in CT without the need to manually annotate a large amount of training data (Chapter 4);
+                               4. The development of a novel multi-planar 3D anisotropic CNN architecture for prostate segmentation in multi-planar T2W MRI images (Chapter 5).},
   copromotor = {H. Meine and A. Schenk},
   file = {Chle22.pdf:pdf/Chle22.pdf:PDF},
   journal = {PhD thesis},
@@ -5785,29 +5809,29 @@ @article{Chun18
   pages = {e0191874},
   doi = {10.1371/journal.pone.0191874},
   abstract = {PURPOSE: To evaluate whether, and to which extent, experienced radiologists are
-                                                       able to visually correctly differentiate transient from persistent subsolid
-                                                       nodules from a single CT examination alone and to determine CT morphological
-                                                       features to make this differentiation.
-                                                       MATERIALS AND METHODS: We selected 86 transient and 135 persistent subsolid
-                                                       nodules from the National Lung Screening Trial (NLST) database. Four experienced
-                                                       radiologists visually assessed a predefined list of morphological features and
-                                                       gave a final judgment on a continuous scale (0-100). To assess observer
-                                                       performance, area under the receiver operating characteristic (ROC) curve was
-                                                       calculated. Statistical differences of morphological features between transient
-                                                       and persistent lesions were calculated using Chi-square. Inter-observer agreement
-                                                       of morphological features was evaluated by percentage agreement.
-                                                       RESULTS: Forty-nine lesions were excluded by at least 2 observers, leaving 172
-                                                       lesions for analysis. On average observers were able to differentiate transient
-                                                       from persistent subsolid nodules >= 10 mm with an area under the curve of 0.75
-                                                       (95% CI 0.67-0.82). Nodule type, lesion margin, presence of a well-defined
-                                                       border, and pleural retraction showed significant differences between transient
-                                                       and persistent lesions in two observers. Average pair-wise percentage agreement
-                                                       for these features was 81%, 64%, 47% and 89% respectively. Agreement for other
-                                                       morphological features varied from 53% to 95%.
-                                                       CONCLUSION: The visual capacity of experienced radiologists to differentiate
-                                                       persistent and transient subsolid nodules is moderate in subsolid nodules larger
-                                                       than 10 mm. Performance of the visual assessment of CT morphology alone is not
-                                                       sufficient to generally abandon a short-term follow-up for subsolid nodules.},
+                                                         able to visually correctly differentiate transient from persistent subsolid
+                                                         nodules from a single CT examination alone and to determine CT morphological
+                                                         features to make this differentiation.
+                                                         MATERIALS AND METHODS: We selected 86 transient and 135 persistent subsolid
+                                                         nodules from the National Lung Screening Trial (NLST) database. Four experienced
+                                                         radiologists visually assessed a predefined list of morphological features and
+                                                         gave a final judgment on a continuous scale (0-100). To assess observer
+                                                         performance, area under the receiver operating characteristic (ROC) curve was
+                                                         calculated. Statistical differences of morphological features between transient
+                                                         and persistent lesions were calculated using Chi-square. Inter-observer agreement
+                                                         of morphological features was evaluated by percentage agreement.
+                                                         RESULTS: Forty-nine lesions were excluded by at least 2 observers, leaving 172
+                                                         lesions for analysis. On average observers were able to differentiate transient
+                                                         from persistent subsolid nodules >= 10 mm with an area under the curve of 0.75
+                                                         (95% CI 0.67-0.82). Nodule type, lesion margin, presence of a well-defined
+                                                         border, and pleural retraction showed significant differences between transient
+                                                         and persistent lesions in two observers. Average pair-wise percentage agreement
+                                                         for these features was 81%, 64%, 47% and 89% respectively. Agreement for other
+                                                         morphological features varied from 53% to 95%.
+                                                         CONCLUSION: The visual capacity of experienced radiologists to differentiate
+                                                         persistent and transient subsolid nodules is moderate in subsolid nodules larger
+                                                         than 10 mm. Performance of the visual assessment of CT morphology alone is not
+                                                         sufficient to generally abandon a short-term follow-up for subsolid nodules.},
   file = {Chun18.pdf:pdf/Chun18.pdf:PDF},
   optnote = {DIAG},
   pmid = {29438443},
@@ -6507,9 +6531,9 @@ @conference{Dama23
   booktitle = {MIDL},
   title = {On the robustness of regressing tumor percentage as an explainable detector in histopathology whole-slide images},
   abstract = {In recent years, Multiple Instance Learning (MIL) approaches have gained popularity to address the task of weakly-supervised tumor detection in whole-slide images (WSIs).
-                          However, standard MIL relies on classification methods for tumor detection that require negative control, i.e., tumor-free cases, which are challenging to obtain in real-world clinical scenarios, especially when considering surgical resection specimens.
-                          Inspired by recent work, in this paper we tackle tumor detection via a MIL-like weakly-supervised regression approach to predict the percentage of tumor present in WSIs, a clinically available target that allows to overcome the problem of need for manual annotations or presence of tumor-free slides.
-                          We characterize the quality of such a target by investigating its robustness in the presence of noise on regression percentages and provide explainability through attention maps. We test our approach on breast cancer data from primary tumor and lymph node metastases.},
+                            However, standard MIL relies on classification methods for tumor detection that require negative control, i.e., tumor-free cases, which are challenging to obtain in real-world clinical scenarios, especially when considering surgical resection specimens.
+                            Inspired by recent work, in this paper we tackle tumor detection via a MIL-like weakly-supervised regression approach to predict the percentage of tumor present in WSIs, a clinically available target that allows to overcome the problem of need for manual annotations or presence of tumor-free slides.
+                            We characterize the quality of such a target by investigating its robustness in the presence of noise on regression percentages and provide explainability through attention maps. We test our approach on breast cancer data from primary tumor and lymph node metastases.},
   optnote = {DIAG, PATHOLOGY},
   year = {2023},
 }
@@ -6999,10 +7023,10 @@ @conference{Dinn24
   booktitle = ECR,
   title = {External validation of an AI algorithm for pulmonary nodule malignancy risk estimation on a dataset of incidentally detected pulmonary nodules},
   abstract = {Purpose: An AI algorithm for malignancy risk estimation was developed and validated on screen-detected pulmonary nodules. We aimed to test the AI algorithm in clinical data and compare the results to the Brock model.
-               Methods and materials: A size-matched dataset of solid incidentally detected pulmonary nodules with a diameter range between 5-15 mm was collected, consisting of 53 malignant nodules from CT scans performed at least two months prior to a lung cancer diagnosis, and 53 benign nodules. Differences in patient and nodule characteristics between the malignant and benign groups were assessed. AUCs and 95% confidence intervals were determined and compared using the DeLong method. Sensitivity and specificity were determined at a 10% malignancy risk threshold for the AI algorithm and Brock model, according to the British Thoracic Society guidelines.
-               Results: No statistical difference in size was detected between the malignant and benign nodules (median [range]: 10.8 [5.8, 15.4]; 10.4 [5.8, 15.1]; respectively). Cases with malignant nodules had a significantly lower number of nodules (p=0.001). The AI algorithm significantly outperformed the Brock model (p<0.001). AUC [95% CI] of the AI algorithm and Brock model were 0.87 [0.80-0.94] and 0.59 [0.48-0.69], respectively. The AI algorithm had a higher sensitivity (0.60 [0.46-0.74]) and specificity (0.87 [0.75-0.95]) than the Brock model (0.42 [0.28-0.56]; 0.75 [0.62-0.86]; respectively).
-               Conclusion: The AI algorithm outperformed the Brock model in a clinical dataset with a more heterogeneous population than a screening population. The AI algorithm demonstrated the potential for nodule risk stratification in a clinical setting, which can aid clinicians in decisions in nodule management, thereby potentially reducing unnecessary follow-up.
-               Limitations: This is a retrospective validation on a single-centre dataset. More research is needed to test the performance in larger and multi-centre data.},
+                 Methods and materials: A size-matched dataset of solid incidentally detected pulmonary nodules with a diameter range between 5-15 mm was collected, consisting of 53 malignant nodules from CT scans performed at least two months prior to a lung cancer diagnosis, and 53 benign nodules. Differences in patient and nodule characteristics between the malignant and benign groups were assessed. AUCs and 95% confidence intervals were determined and compared using the DeLong method. Sensitivity and specificity were determined at a 10% malignancy risk threshold for the AI algorithm and Brock model, according to the British Thoracic Society guidelines.
+                 Results: No statistical difference in size was detected between the malignant and benign nodules (median [range]: 10.8 [5.8, 15.4]; 10.4 [5.8, 15.1]; respectively). Cases with malignant nodules had a significantly lower number of nodules (p=0.001). The AI algorithm significantly outperformed the Brock model (p<0.001). AUC [95% CI] of the AI algorithm and Brock model were 0.87 [0.80-0.94] and 0.59 [0.48-0.69], respectively. The AI algorithm had a higher sensitivity (0.60 [0.46-0.74]) and specificity (0.87 [0.75-0.95]) than the Brock model (0.42 [0.28-0.56]; 0.75 [0.62-0.86]; respectively).
+                 Conclusion: The AI algorithm outperformed the Brock model in a clinical dataset with a more heterogeneous population than a screening population. The AI algorithm demonstrated the potential for nodule risk stratification in a clinical setting, which can aid clinicians in decisions in nodule management, thereby potentially reducing unnecessary follow-up.
+                 Limitations: This is a retrospective validation on a single-centre dataset. More research is needed to test the performance in larger and multi-centre data.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -7031,21 +7055,21 @@ @article{Donn19
   url = {http://dx.doi.org/10.1093/cid/ciz1008},
   volume = {71},
   abstract = {Abstract
-
-                                           Background
-                                           Invasive fungal diseases (IFDs) remain important causes of morbidity and mortality. The consensus definitions of the Infectious Diseases Group of the European Organization for Research and Treatment of Cancer and the Mycoses Study Group have been of immense value to researchers who conduct clinical trials of antifungals, assess diagnostic tests, and undertake epidemiologic studies. However, their utility has not extended beyond patients with cancer or recipients of stem cell or solid organ transplants. With newer diagnostic techniques available, it was clear that an update of these definitions was essential.
-
-
-                                           Methods
-                                           To achieve this, 10 working groups looked closely at imaging, laboratory diagnosis, and special populations at risk of IFD. A final version of the manuscript was agreed upon after the groups' findings were presented at a scientific symposium and after a 3-month period for public comment. There were several rounds of discussion before a final version of the manuscript was approved.
-
-
-                                           Results
-                                           There is no change in the classifications of "proven," "probable," and "possible" IFD, although the definition of "probable" has been expanded and the scope of the category "possible" has been diminished. The category of proven IFD can apply to any patient, regardless of whether the patient is immunocompromised. The probable and possible categories are proposed for immunocompromised patients only, except for endemic mycoses.
-
-
-                                           Conclusions
-                                           These updated definitions of IFDs should prove applicable in clinical, diagnostic, and epidemiologic research of a broader range of patients at high-risk.},
+  
+                                             Background
+                                             Invasive fungal diseases (IFDs) remain important causes of morbidity and mortality. The consensus definitions of the Infectious Diseases Group of the European Organization for Research and Treatment of Cancer and the Mycoses Study Group have been of immense value to researchers who conduct clinical trials of antifungals, assess diagnostic tests, and undertake epidemiologic studies. However, their utility has not extended beyond patients with cancer or recipients of stem cell or solid organ transplants. With newer diagnostic techniques available, it was clear that an update of these definitions was essential.
+  
+  
+                                             Methods
+                                             To achieve this, 10 working groups looked closely at imaging, laboratory diagnosis, and special populations at risk of IFD. A final version of the manuscript was agreed upon after the groups' findings were presented at a scientific symposium and after a 3-month period for public comment. There were several rounds of discussion before a final version of the manuscript was approved.
+  
+  
+                                             Results
+                                             There is no change in the classifications of "proven," "probable," and "possible" IFD, although the definition of "probable" has been expanded and the scope of the category "possible" has been diminished. The category of proven IFD can apply to any patient, regardless of whether the patient is immunocompromised. The probable and possible categories are proposed for immunocompromised patients only, except for endemic mycoses.
+  
+  
+                                             Conclusions
+                                             These updated definitions of IFDs should prove applicable in clinical, diagnostic, and epidemiologic research of a broader range of patients at high-risk.},
   all_ss_ids = {[7284993f1815bf7cc15af7dbc735594d33cbabb6]},
   automatic = {yes},
   citation-count = {1190},
@@ -7175,8 +7199,8 @@ @conference{Eeke22
   booktitle = {ECP},
   year = {2022},
   abstract = {Artificial intelligence (AI) based quantification of cell-level PD-L1 status enables spatial analysis and allows reliable and reproducible assessment of the tumor proportion score. In this study, we assess the cell-level inter-pathologist agreement as human benchmark for AI development and validation. Three pathologists manually annotated the centers of all nuclei within 53 regions of interest in 12 whole- slide images (40X magnification) of NSCLC cases and classified them as PD-L1 negative/positive tumor cells, PD-L1 positive immune cells or other cells. Agreement was quantified using F1 score analysis, with agreement defined as annotations less than 10 um apart and of the same class. An average of 9044 nuclei (1550 negative, 2367 positive tumor cells, 1244 positive immune cells, 3881 other cells) were manually annotated by the three pathologists. The mean F1 score over pairs of pathologists at dataset level was 0.59 (range 0.54-0.65). When split across classes, the mean per-pair F1 scores stay approximately the same, indicating the readers perform similarly regardless of cell type. Besides human variability in manual point annotations with respect to the center of nuclei, lack of context contributed to disagreement: readers who reported they solely examined the ROIs tended to disagree more with readers that reported they also looked outside the ROIs for additional (morphological/density) information.
-
-                          In conclusion, agreement on determining the PD-L1 status of individual cells is only moderate, suggesting a role for AI. By quantifying the inter-rater agreement of pathologists, we have created a human benchmark which may serve as an upper bound (and could be combined via majority vote) for the validation of AI at celllevel, something not done previously. Cell-level AI-based assessment of PD-L1 may supersede slide level scoring, adding significant information on the heterogeneity and spatial distribution over the tumor.},
+  
+                            In conclusion, agreement on determining the PD-L1 status of individual cells is only moderate, suggesting a role for AI. By quantifying the inter-rater agreement of pathologists, we have created a human benchmark which may serve as an upper bound (and could be combined via majority vote) for the validation of AI at cell-level, something not done previously. Cell-level AI-based assessment of PD-L1 may supersede slide level scoring, adding significant information on the heterogeneity and spatial distribution over the tumor.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -7186,10 +7210,10 @@ @conference{Eeke22a
   booktitle = {ECP},
   year = {2022},
   abstract = {Nuclei detection in histopathology images is an important prerequisite step of downstream research and clinical analyses, such as counting cells and spatial interactions. In this study, we developed an AI-based nuclei detector using the YOLOv5 framework in whole-slide NSCLC cases. Our dataset consisted of 42 PD-L1 stained cases (30 training, 12 test). Four trained (non-expert) readers manually annotated all nuclei (both positive/negative) within regions of interest (ROIs) viewed at 40X magnification. We trained a YOLOv5(s) network on annotations of one reader. Performance was measured using F1 score analysis; hits were defined as being less than 10 um away from annotations.
-
-                          We evaluate YOLOv5 on the test set by pairing it against all four readers separately. There, YOLOv5 performs excellently, falling within the interrater variability of the four readers: the mean F1 score over algorithm-reader pairs is 0.84 (range 0.76-0.92) while the mean F1 score over pairs of readers is 0.82 (range 0.76-0.86). When we determine the cell count (number of annotations/predictions) per ROI in the test set, agreement of algorithm-reader pairs and reader pairs is equally well aligned: 0.93 (range 0.90-0.97) versus 0.94 (range 0.92-0.96). Visual inspection indicates YOLOv5 performs equally well on PD-L1 positive and negative cells.
-
-                          In future work, we could extend this detector to additional tissues and immunohistochemistry stainings. Moreover, this detector could be used as a AI-assisted manual point annotation tool: while human readers perform the (context-driven) task of delineating homogeneous regions (e.g. clusters of PD-L1positive stained cells), the detector performs the (local, yet laborious) task of identifying individual nuclei within these regions, providing labelled point annotations.},
+  
+                            We evaluate YOLOv5 on the test set by pairing it against all four readers separately. There, YOLOv5 performs excellently, falling within the interrater variability of the four readers: the mean F1 score over algorithm-reader pairs is 0.84 (range 0.76-0.92) while the mean F1 score over pairs of readers is 0.82 (range 0.76-0.86). When we determine the cell count (number of annotations/predictions) per ROI in the test set, agreement of algorithm-reader pairs and reader pairs is equally well aligned: 0.93 (range 0.90-0.97) versus 0.94 (range 0.92-0.96). Visual inspection indicates YOLOv5 performs equally well on PD-L1 positive and negative cells.
+  
+                            In future work, we could extend this detector to additional tissues and immunohistochemistry stainings. Moreover, this detector could be used as an AI-assisted manual point annotation tool: while human readers perform the (context-driven) task of delineating homogeneous regions (e.g. clusters of PD-L1 positive stained cells), the detector performs the (local, yet laborious) task of identifying individual nuclei within these regions, providing labelled point annotations.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -7233,9 +7257,9 @@ @conference{Eerd18
   booktitle = ECR,
   year = {2018},
   abstract = {Purpose: To evaluate the performance of computer-aided traumatic cerebral microbleed detection (CAD) with and without human interference.
-                                                       Methods and Materials: 33 adult patients admitted to our emergency department with moderate or severe TBI (mean age 33 years, 21 males) underwent a standardized trauma 3T MRI-protocol at 28 weeks. The microbleeds in their SWI-scans were annotated by an expert. A CAD system was developed, based on this training set. Six experts, blind to the CAD-results, annotated a subset of ten patients. In two experiments, we compared the performance of the CAD system to each of these six experts, using the majority voting results of the other five experts as the reference standard for the calculation of performance characteristics (paired t-test). In the first experiment, the performance of fully automatic microbleed detection was assessed. In the second experiment, one expert removed CAD-annotations she considered false positives from the automatically detected microbleeds, and briefly screened the CAD-annotated SWI-scans to complete the dataset with missed definite microbleeds.
-                                                       Results: Fully manual evaluation took one hour per patient with an average sensitivity of 77% (SD 12.4%). The sensitivity of fully automatic detection of candidate microbleeds was 89% (SD 0.8%). Evaluation of the CAD results by an expert took 13 minutes per patient with a sensitivity of 93% (SD 1.0%) (p < 0.05 versus fully manual evaluation).
-                                                       Conclusion: This CAD system allows detecting more microbleeds in a reduced reading time. This may facilitate the execution of otherwise too time-consuming large studies on the clinical relevance of microbleeds.},
+                                                         Methods and Materials: 33 adult patients admitted to our emergency department with moderate or severe TBI (mean age 33 years, 21 males) underwent a standardized trauma 3T MRI-protocol at 28 weeks. The microbleeds in their SWI-scans were annotated by an expert. A CAD system was developed, based on this training set. Six experts, blind to the CAD-results, annotated a subset of ten patients. In two experiments, we compared the performance of the CAD system to each of these six experts, using the majority voting results of the other five experts as the reference standard for the calculation of performance characteristics (paired t-test). In the first experiment, the performance of fully automatic microbleed detection was assessed. In the second experiment, one expert removed CAD-annotations she considered false positives from the automatically detected microbleeds, and briefly screened the CAD-annotated SWI-scans to complete the dataset with missed definite microbleeds.
+                                                         Results: Fully manual evaluation took one hour per patient with an average sensitivity of 77% (SD 12.4%). The sensitivity of fully automatic detection of candidate microbleeds was 89% (SD 0.8%). Evaluation of the CAD results by an expert took 13 minutes per patient with a sensitivity of 93% (SD 1.0%) (p < 0.05 versus fully manual evaluation).
+                                                         Conclusion: This CAD system allows detecting more microbleeds in a reduced reading time. This may facilitate the execution of otherwise too time-consuming large studies on the clinical relevance of microbleeds.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -7245,29 +7269,29 @@ @article{Eerd21
   doi = {10.3174/ajnr.a7028},
   year = {2021},
   abstract = {BACKGROUND AND PURPOSE: In the chronic phase after traumatic brain injury, DTI findings reflect WM integrity. DTI interpretation in the subacute phase is less straightforward. Microbleed evaluation with SWI is straightforward in both phases. We evaluated whether the microbleed concentration in the subacute phase is associated with the integrity of normal-appearing WM in the chronic phase. MATERIALS AND METHODS: Sixty of 211 consecutive patients 18 years of age or older admitted to our emergency department <=24 hours after moderate to severe traumatic brain injury matched the selection criteria. Standardized 3T SWI, DTI, and T1WI were obtained 3 and 26 weeks after traumatic brain injury in 31 patients and 24 healthy volunteers. At baseline, microbleed concentrations were calculated. At follow-up, mean diffusivity (MD) was calculated in the normal-appearing WM in reference to the healthy volunteers (MDz). Through linear regression, we evaluated the relation between microbleed concentration and MDz in predefined structures. RESULTS: In the cerebral hemispheres, MDz at follow-up was independently associated with the microbleed concentration at baseline (left: B = 38.4 [95% CI 7.5-69.3], P = .017; right: B = 26.3 [95% CI 5.7-47.0], P = .014). No such relation was demonstrated in the central brain. MDz in the corpus callosum was independently associated with the microbleed concentration in the structures connected by WM tracts running through the corpus callosum (B = 20.0 [95% CI 24.8-75.2], P < .000). MDz in the central brain was independently associated with the microbleed concentration in the cerebral hemispheres (B = 25.7 [95% CI 3.9-47.5], P = .023). CONCLUSIONS: SWI-assessed microbleeds in the subacute phase are associated with DTI-based WM integrity in the chronic phase. These associations are found both within regions and between functionally connected regions. B
-                         :   linear regression coefficient
-                         Bcmb-conc
-                         :   linear regression coefficient with microbleed concentration as independent variable
-                         Bcmb-nr
-                         :   linear regression coefficient with microbleed number as independent variable
-                         MD
-                         :   mean diffusivity
-                         MDz
-                         :   Z -score of mean diffusivity, normalized to the healthy control participants
-                         t1
-                         :   3 (2-5) weeks after TBI
-                         t2
-                         :   26 (25-28) weeks after TBI
-                         TAI
-                         :   traumatic axonal injury
-                         TBI
-                         :   traumatic brain injury
-                         FA
-                         :   fractional anisotropy
-                         MARS
-                         :   Microbleed Anatomical Rating Scale
-                         GCS
-                         :   Glasgow Coma Scale},
+                           :   linear regression coefficient
+                           Bcmb-conc
+                           :   linear regression coefficient with microbleed concentration as independent variable
+                           Bcmb-nr
+                           :   linear regression coefficient with microbleed number as independent variable
+                           MD
+                           :   mean diffusivity
+                           MDz
+                           :   Z-score of mean diffusivity, normalized to the healthy control participants
+                           t1
+                           :   3 (2-5) weeks after TBI
+                           t2
+                           :   26 (25-28) weeks after TBI
+                           TAI
+                           :   traumatic axonal injury
+                           TBI
+                           :   traumatic brain injury
+                           FA
+                           :   fractional anisotropy
+                           MARS
+                           :   Microbleed Anatomical Rating Scale
+                           GCS
+                           :   Glasgow Coma Scale},
   url = {http://dx.doi.org/10.3174/ajnr.A7028},
   file = {Eerd21.pdf:pdf\\Eerd21.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -7289,16 +7313,16 @@ @article{Eerd21a
   url = {http://dx.doi.org/10.1007/s00234-021-02839-z},
   volume = {64},
   abstract = {Abstract                 Introduction
-                                         In order to augment the certainty of the radiological interpretation of "possible microbleeds" after traumatic brain injury (TBI), we assessed their longitudinal evolution on 3-T SWI in patients with moderate/severe TBI.
-
-                                         Methods
-                                         Standardized 3-T SWI and T1-weighted imaging were obtained 3 and 26 weeks after TBI in 31 patients. Their microbleeds were computer-aided detected and classified by a neuroradiologist as no, possible, or definite at baseline and follow-up, separately (single-scan evaluation). Thereafter, the classifications were re-evaluated after comparison between the time-points (post-comparison evaluation). We selected the possible microbleeds at baseline at single-scan evaluation and recorded their post-comparison classification at follow-up.
-
-                                         Results
-                                         Of the 1038 microbleeds at baseline, 173 were possible microbleeds. Of these, 53.8% corresponded to no microbleed at follow-up. At follow-up, 30.6% were possible and 15.6% were definite. Of the 120 differences between baseline and follow-up, 10% showed evidence of a pathophysiological change over time. Proximity to extra-axial injury and proximity to definite microbleeds were independently predictive of becoming a definite microbleed at follow-up. The reclassification level differed between anatomical locations.
-
-                                         Conclusions
-                                         Our findings support disregarding possible microbleeds in the absence of clinical consequences. In selected cases, however, a follow-up SWI-scan could be considered to exclude evolution into a definite microbleed.},
+                                           In order to augment the certainty of the radiological interpretation of "possible microbleeds" after traumatic brain injury (TBI), we assessed their longitudinal evolution on 3-T SWI in patients with moderate/severe TBI.
+  
+                                           Methods
+                                           Standardized 3-T SWI and T1-weighted imaging were obtained 3 and 26 weeks after TBI in 31 patients. Their microbleeds were computer-aided detected and classified by a neuroradiologist as no, possible, or definite at baseline and follow-up, separately (single-scan evaluation). Thereafter, the classifications were re-evaluated after comparison between the time-points (post-comparison evaluation). We selected the possible microbleeds at baseline at single-scan evaluation and recorded their post-comparison classification at follow-up.
+  
+                                           Results
+                                           Of the 1038 microbleeds at baseline, 173 were possible microbleeds. Of these, 53.8% corresponded to no microbleed at follow-up. At follow-up, 30.6% were possible and 15.6% were definite. Of the 120 differences between baseline and follow-up, 10% showed evidence of a pathophysiological change over time. Proximity to extra-axial injury and proximity to definite microbleeds were independently predictive of becoming a definite microbleed at follow-up. The reclassification level differed between anatomical locations.
+  
+                                           Conclusions
+                                           Our findings support disregarding possible microbleeds in the absence of clinical consequences. In selected cases, however, a follow-up SWI-scan could be considered to exclude evolution into a definite microbleed.},
   all_ss_ids = {[92a2cb9804d3599cc1e8e81c512d0203c6d10da7]},
   automatic = {yes},
   citation-count = {1},
@@ -7726,12 +7750,12 @@ @conference{Enge19
   booktitle = ARVO,
   title = {Automatic Segmentation of Drusen and Exudates on Color Fundus Images using Generative Adversarial Networks},
   abstract = {Purpose: The presence of drusen and exudates, visible as bright lesions on color fundus images, is one of the early signs of visual threatening diseases such as Age-related Macular Degeneration and Diabetic Retinopathy. Accurate detection and quantification of these lesions during screening can help identify patients that would benefit from treatment. We developed a method based on generative adversarial networks (GANs) to segment bright lesions on color fundus images.
-
-                                                       Methods: We used 4179 color fundus images that were acquired during clinical routine. The images were contrast enhanced to increase the contrast between bright lesions and the background. All bright lesions were manually annotated by marking the center point of the lesions. The GAN was trained to estimate the image without bright lesions. The final segmentation was obtained by taking the difference between the input image and the estimated output.
-
-                                                       Results: This method was applied to an independent test set of 52 color fundus images with non-advanced stages of AMD from the European Genetic Database, which were fully segmented for bright lesions by two trained human observers. The method achieved Dice scores of 0.4862 and 0.4849 when compared to the observers, whereas the inter-observer Dice score was 0.5043. The total segmented bright lesion area per image was evaluated using the intraclass correlation (ICC). The method scored 0.8537 and 0.8352 when compared to the observers, whereas the inter-observer ICC was 0.8893.
-
-                                                       Conclusions: The results show the performance is close to the agreement between trained observers. This automatic segmentation of bright lesions can help early diagnosis of visual threatening diseases and opens the way for large scale clinical trials.},
+  
+                                                         Methods: We used 4179 color fundus images that were acquired during clinical routine. The images were contrast enhanced to increase the contrast between bright lesions and the background. All bright lesions were manually annotated by marking the center point of the lesions. The GAN was trained to estimate the image without bright lesions. The final segmentation was obtained by taking the difference between the input image and the estimated output.
+  
+                                                         Results: This method was applied to an independent test set of 52 color fundus images with non-advanced stages of AMD from the European Genetic Database, which were fully segmented for bright lesions by two trained human observers. The method achieved Dice scores of 0.4862 and 0.4849 when compared to the observers, whereas the inter-observer Dice score was 0.5043. The total segmented bright lesion area per image was evaluated using the intraclass correlation (ICC). The method scored 0.8537 and 0.8352 when compared to the observers, whereas the inter-observer ICC was 0.8893.
+  
+                                                         Conclusions: The results show the performance is close to the agreement between trained observers. This automatic segmentation of bright lesions can help early diagnosis of visual threatening diseases and opens the way for large scale clinical trials.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
   all_ss_ids = {[34559bb0d95c5166625945eef9b53b21a30838fa]},
@@ -8151,12 +8175,12 @@ @article{Four21
   url = {http://dx.doi.org/10.1007/s00330-020-07598-8},
   volume = {31},
   abstract = {Abstract
-                                         Existing quantitative imaging biomarkers (QIBs) are associated with known biological tissue characteristics and follow a well-understood path of technical, biological and clinical validation before incorporation into clinical trials. In radiomics, novel data-driven processes extract numerous visually imperceptible statistical features from the imaging data with no a priori assumptions on their correlation with biological processes. The selection of relevant features (radiomic signature) and incorporation into clinical trials therefore requires additional considerations to ensure meaningful imaging endpoints. Also, the number of radiomic features tested means that power calculations would result in sample sizes impossible to achieve within clinical trials. This article examines how the process of standardising and validating data-driven imaging biomarkers differs from those based on biological associations. Radiomic signatures are best developed initially on datasets that represent diversity of acquisition protocols as well as diversity of disease and of normal findings, rather than within clinical trials with standardised and optimised protocols as this would risk the selection of radiomic features being linked to the imaging process rather than the pathology. Normalisation through discretisation and feature harmonisation are essential pre-processing steps. Biological correlation may be performed after the technical and clinical validity of a radiomic signature is established, but is not mandatory. Feature selection may be part of discovery within a radiomics-specific trial or represent exploratory endpoints within an established trial; a previously validated radiomic signature may even be used as a primary/secondary endpoint, particularly if associations are demonstrated with specific biological processes and pathways being targeted within clinical trials.
-
-                                         Key Points
-                                         * Data-driven processes like radiomics risk false discoveries due to high-dimensionality of the dataset compared to sample size, making adequate diversity of the data, cross-validation and external validation essential to mitigate the risks of spurious associations and overfitting.
-                                         * Use of radiomic signatures within clinical trials requires multistep standardisation of image acquisition, image analysis and data mining processes.
-                                         * Biological correlation may be established after clinical validation but is not mandatory.},
+                                           Existing quantitative imaging biomarkers (QIBs) are associated with known biological tissue characteristics and follow a well-understood path of technical, biological and clinical validation before incorporation into clinical trials. In radiomics, novel data-driven processes extract numerous visually imperceptible statistical features from the imaging data with no a priori assumptions on their correlation with biological processes. The selection of relevant features (radiomic signature) and incorporation into clinical trials therefore requires additional considerations to ensure meaningful imaging endpoints. Also, the number of radiomic features tested means that power calculations would result in sample sizes impossible to achieve within clinical trials. This article examines how the process of standardising and validating data-driven imaging biomarkers differs from those based on biological associations. Radiomic signatures are best developed initially on datasets that represent diversity of acquisition protocols as well as diversity of disease and of normal findings, rather than within clinical trials with standardised and optimised protocols as this would risk the selection of radiomic features being linked to the imaging process rather than the pathology. Normalisation through discretisation and feature harmonisation are essential pre-processing steps. Biological correlation may be performed after the technical and clinical validity of a radiomic signature is established, but is not mandatory. Feature selection may be part of discovery within a radiomics-specific trial or represent exploratory endpoints within an established trial; a previously validated radiomic signature may even be used as a primary/secondary endpoint, particularly if associations are demonstrated with specific biological processes and pathways being targeted within clinical trials.
+  
+                                           Key Points
+                                           * Data-driven processes like radiomics risk false discoveries due to high-dimensionality of the dataset compared to sample size, making adequate diversity of the data, cross-validation and external validation essential to mitigate the risks of spurious associations and overfitting.
+                                           * Use of radiomic signatures within clinical trials requires multistep standardisation of image acquisition, image analysis and data mining processes.
+                                           * Biological correlation may be established after clinical validation but is not mandatory.},
   all_ss_ids = {[c859bd469080b82dc14db62e78d65ef5b5ffa686]},
   automatic = {yes},
   citation-count = {45},
@@ -8191,9 +8215,9 @@ @conference{Fransen22
   booktitle = RSNA,
   title = {Diagnostic AI to speed up MRI protocols by identifying redundant sequences: are all diffusion-weighted prostate MRI sequences necessary?},
   abstract = {PURPOSE: To explore if an expert-level diagnostic AI can help speed up MRI by identifying redundant sequences in diffusion-weighted imaging (DWI) for the diagnostic detection of clinically significant prostate cancer (csPCa).
-                          MATERIALS AND METHODS: Existing deep learning AI architectures detects csPCa based on bpMRI at expert-level. We developed a method in which this AI assesses the added diagnostic value of a sequence. This retrospective study included 840 patients with a bi-parametric prostate MRI (bpMRI) for initial detection of csPCa. The bmMRI comprises a T2-weighted image and DWI with b-values of 50, 400, and 800 s/mm2 on a 3T scanner (Skyra and Prima). Our method entails computing ADC and b1400 maps based on different DWI combinations: 1) b800 excluded, 2) b400 excluded, 3) complete set. AI models for the various bpMRI combination were trained 5-fold and statistically compared with receiver operating curve (ROC) analysis at patient and lesion level using respectively the DeLong's and permutation test.
-                          RESULTS: The mean area under the ROC of the three combinations were respectively 0.78 +-0.027 (SD), 0.76 +-0.051, and 0.77 +- 0.057. The partial area under of the free ROC between 0.1 and 2.5 false positives lesions per patient was respectively 1.44 +- 0.22, 1.58 +- 0.18 and 1.50 +- 0.12. The slight difference in diagnostic performance (patient-level 0.01, lesion-level 0.06 ) when omitting sequence DWI b800 is not significant (respectively p = 0.2 and p = 0.43).
-                          CONCLUSION: We conclude that expert-level AI can identify redundant sequences in MRI. Furthermore, our method provides evidence that in DWI for csPCa detection, the b800 series can be omitted from the regular bpMRI protocol decreasing total MRI scan time by 33\%. These results can provide significant speed-up of any MRI.},
+                            MATERIALS AND METHODS: Existing deep learning AI architectures detect csPCa based on bpMRI at expert level. We developed a method in which this AI assesses the added diagnostic value of a sequence. This retrospective study included 840 patients with a bi-parametric prostate MRI (bpMRI) for initial detection of csPCa. The bpMRI comprises a T2-weighted image and DWI with b-values of 50, 400, and 800 s/mm2 on a 3T scanner (Skyra and Prisma). Our method entails computing ADC and b1400 maps based on different DWI combinations: 1) b800 excluded, 2) b400 excluded, 3) complete set. AI models for the various bpMRI combinations were trained 5-fold and statistically compared with receiver operating characteristic (ROC) analysis at patient and lesion level using DeLong's test and a permutation test, respectively.
+                            RESULTS: The mean areas under the ROC curve of the three combinations were respectively 0.78 +- 0.027 (SD), 0.76 +- 0.051, and 0.77 +- 0.057. The partial area under the free-response ROC curve between 0.1 and 2.5 false-positive lesions per patient was respectively 1.44 +- 0.22, 1.58 +- 0.18 and 1.50 +- 0.12. The slight difference in diagnostic performance (patient-level 0.01, lesion-level 0.06) when omitting sequence DWI b800 is not significant (respectively p = 0.2 and p = 0.43).
+                            CONCLUSION: We conclude that expert-level AI can identify redundant sequences in MRI. Furthermore, our method provides evidence that in DWI for csPCa detection, the b800 series can be omitted from the regular bpMRI protocol decreasing total MRI scan time by 33\%. These results can provide significant speed-up of any MRI.},
   optnote = {DIAG, RADIOLOGY},
   year = {2022},
 }
@@ -8713,7 +8737,7 @@ @article{Gees19
   doi = {10.1007/s13402-019-00429-z},
   abstract = {Purpose:Tumor-stroma ratio (TSR) serves as an independent prognostic factor in colorectal cancer and other solid malignancies. The recent introduction of digital pathology in routine tissue diagnostics holds opportunities for automated TSR analysis. We investigated the potential of computer-aided quantification of intratumoral stroma in rectal cancer whole-slide images. Methods: Histological slides from 129 rectal adenocarcinoma patients were analyzed by two experts who selected a suitable stroma hot-spot and visually assessed TSR. A semi-automatic method based on deep learning was trained to segment all relevant tissue types in rectal cancer histology and subsequently applied to the hot-spots provided by the experts. Patients were assigned to a 'stroma-high' or 'stroma-low' group by both TSR methods (visual and automated). This allowed for prognostic comparison between the two methods in terms of disease-specific and disease-free survival times. Results: With stroma-low as baseline, automated TSR was found to be prognostic independent of age, gender, pT-stage, lymph node status, tumor grade, and whether adjuvant therapy was given, both for disease-specific survival (hazard ratio = 2.48 (95% confidence interval 1.29-4.78)) and for disease-free survival (hazard ratio = 2.05 (95% confidence interval 1.11-3.78)). Visually assessed TSR did not serve as an independent prognostic factor in multivariate analysis. Conclusions: This work shows that TSR is an independent prognosticator in rectal cancer when assessed automatically in user-provided stroma hot-spots. The deep learning-based technology presented here may be a significant aid to pathologists in routine diagnostics.},
-  file = {Gees19.pdf:pdf\\Gees19.pdf:PDF
-                                                         timestamp = (01-03-2019},
+  file = {Gees19.pdf:pdf\\Gees19.pdf:PDF},
+  timestamp = {01-03-2019},
   optnote = {DIAG, RADIOLOGY},
   pmid = {30825182},
   month = {3},
@@ -8903,16 +8927,16 @@ @article{Ghaf16a
   pages = {6246-6258},
   doi = {10.1118/1.4966029},
   abstract = {Purpose:
-                                                       White matter hyperintensities (WMH) are seen on FLAIR-MRI in several neurological disorders, including multiple sclerosis, dementia, Parkinsonism, stroke and cerebral small vessel disease (SVD). WMHs are often used as biomarkers for prognosis or disease progression in these diseases, and additionally longitudinal quantification of WMHs is used to evaluate therapeutic strategies. Human readers show considerable disagreement and inconsistency on detection of small lesions. A multitude of automated detection algorithms for WMHs exists, but since most of the current automated approaches are tuned to optimize segmentation performance according to Jaccard or Dice scores, smaller WMHs often go undetected in these approaches. In this paper, the authors propose a method to accurately detect all WMHs, large as well as small.
-
-                                                       Methods:
-                                                       A two-stage learning approach was used to discriminate WMHs from normal brain tissue. Since small and larger WMHs have quite a different appearance, the authors have trained two probabilistic classifiers: one for the small WMHs (<3 mm effective diameter) and one for the larger WMHs (>3 mm in-plane effective diameter). For each size-specific classifier, an Adaboost is trained for five iterations, with random forests as the basic classifier. The feature sets consist of 22 features including intensities, location information, blob detectors, and second order derivatives. The outcomes of the two first-stage classifiers were combined into a single WMH likelihood by a second-stage classifier. Their method was trained and evaluated on a dataset with MRI scans of 362 SVD patients (312 subjects for training and validation annotated by one and 50 for testing annotated by two trained raters). To analyze performance on the separate test set, the authors performed a free-response receiving operating characteristic (FROC) analysis, instead of using segmentation based methods that tend to ignore the contribution of small WMHs.
-
-                                                       Results:
-                                                       Experimental results based on FROC analysis demonstrated a close performance of the proposed computer aided detection (CAD) system to human readers. While an independent reader had 0.78 sensitivity with 28 false positives per volume on average, their proposed CAD system reaches a sensitivity of 0.73 with the same number of false positives.
-
-                                                       Conclusions:
-                                                       The authors have developed a CAD system with all its ingredients being optimized for a better detection of WMHs of all size, which shows performance close to an independent reader.},
+                                                         White matter hyperintensities (WMH) are seen on FLAIR-MRI in several neurological disorders, including multiple sclerosis, dementia, Parkinsonism, stroke and cerebral small vessel disease (SVD). WMHs are often used as biomarkers for prognosis or disease progression in these diseases, and additionally longitudinal quantification of WMHs is used to evaluate therapeutic strategies. Human readers show considerable disagreement and inconsistency on detection of small lesions. A multitude of automated detection algorithms for WMHs exists, but since most of the current automated approaches are tuned to optimize segmentation performance according to Jaccard or Dice scores, smaller WMHs often go undetected in these approaches. In this paper, the authors propose a method to accurately detect all WMHs, large as well as small.
+  
+                                                         Methods:
+                                                         A two-stage learning approach was used to discriminate WMHs from normal brain tissue. Since small and larger WMHs have quite a different appearance, the authors have trained two probabilistic classifiers: one for the small WMHs (<3 mm effective diameter) and one for the larger WMHs (>3 mm in-plane effective diameter). For each size-specific classifier, an Adaboost is trained for five iterations, with random forests as the basic classifier. The feature sets consist of 22 features including intensities, location information, blob detectors, and second order derivatives. The outcomes of the two first-stage classifiers were combined into a single WMH likelihood by a second-stage classifier. Their method was trained and evaluated on a dataset with MRI scans of 362 SVD patients (312 subjects for training and validation annotated by one and 50 for testing annotated by two trained raters). To analyze performance on the separate test set, the authors performed a free-response receiving operating characteristic (FROC) analysis, instead of using segmentation based methods that tend to ignore the contribution of small WMHs.
+  
+                                                         Results:
+                                                         Experimental results based on FROC analysis demonstrated a close performance of the proposed computer aided detection (CAD) system to human readers. While an independent reader had 0.78 sensitivity with 28 false positives per volume on average, their proposed CAD system reaches a sensitivity of 0.73 with the same number of false positives.
+  
+                                                         Conclusions:
+                                                         The authors have developed a CAD system with all its ingredients being optimized for a better detection of WMHs of all sizes, which shows performance close to an independent reader.},
   file = {Ghaf16a.pdf:pdf\\Ghaf16a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27908171},
@@ -9041,10 +9065,10 @@ @article{Gibs17
   doi = {10.1016/j.media.2017.07.004},
   url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5666910/},
   abstract = {Segmentation algorithms are typically evaluated by comparison to an accepted reference standard. The cost of generating accurate reference standards for medical image segmentation can be substantial. Since the study cost and the likelihood of detecting a clinically meaningful difference in accuracy both depend on the size and on the quality of the study reference standard, balancing these trade-offs supports the efficient use of research resources.
-
-                                                       In this work, we derive a statistical power calculation that enables researchers to estimate the appropriate sample size to detect clinically meaningful differences in segmentation accuracy (i.e. the proportion of voxels matching the reference standard) between two algorithms. Furthermore, we derive a formula to relate reference standard errors to their effect on the sample sizes of studies using lower-quality (but potentially more affordable and practically available) reference standards.
-
-                                                       The accuracy of the derived sample size formula was estimated through Monte Carlo simulation, demonstrating, with 95% confidence, a predicted statistical power within 4% of simulated values across a range of model parameters. This corresponds to sample size errors of less than 4 subjects and errors in the detectable accuracy difference less than 0.6%. The applicability of the formula to real-world data was assessed using bootstrap resampling simulations for pairs of algorithms from the PROMISE12 prostate MR segmentation challenge data set. The model predicted the simulated power for the majority of algorithm pairs within 4% for simulated experiments using a high-quality reference standard and within 6% for simulated experiments using a low-quality reference standard. A case study, also based on the PROMISE12 data, illustrates using the formulae to evaluate whether to use a lower-quality reference standard in a prostate segmentation study.},
+  
+                                                         In this work, we derive a statistical power calculation that enables researchers to estimate the appropriate sample size to detect clinically meaningful differences in segmentation accuracy (i.e. the proportion of voxels matching the reference standard) between two algorithms. Furthermore, we derive a formula to relate reference standard errors to their effect on the sample sizes of studies using lower-quality (but potentially more affordable and practically available) reference standards.
+  
+                                                         The accuracy of the derived sample size formula was estimated through Monte Carlo simulation, demonstrating, with 95% confidence, a predicted statistical power within 4% of simulated values across a range of model parameters. This corresponds to sample size errors of less than 4 subjects and errors in the detectable accuracy difference less than 0.6%. The applicability of the formula to real-world data was assessed using bootstrap resampling simulations for pairs of algorithms from the PROMISE12 prostate MR segmentation challenge data set. The model predicted the simulated power for the majority of algorithm pairs within 4% for simulated experiments using a high-quality reference standard and within 6% for simulated experiments using a low-quality reference standard. A case study, also based on the PROMISE12 data, illustrates using the formulae to evaluate whether to use a lower-quality reference standard in a prostate segmentation study.},
   file = {Gibs17.pdf:pdf\\Gibs17.pdf:PDF},
   optnote = {DIAG},
   pmid = {28772163},
@@ -9062,7 +9086,7 @@ @inproceedings{Gibs18
   year = {2018},
   doi = {10.1007/978-3-030-00937-3_58},
   abstract = {Deep-learning-based segmentation tools have yielded higher reported segmentation accuracies for many medical imaging applications. However, inter-site variability in image properties can challenge the translation of these tools to data from 'unseen' sites not included in the training data. This study quantifies the impact of inter-site variability on the accuracy of deep-learning-based segmentations of the prostate from magnetic resonance (MR) images, and evaluates two strategies for mitigating the reduced accuracy for data from unseen sites: training on multi-site data and training with limited additional data from the unseen site. Using 376 T2-weighted prostate MR images from six sites, we compare the segmentation accuracy (Dice score and boundary distance) of
-                                                       three deep-learning-based networks trained on data from a single site and on various configurations of data from multiple sites. We found that the segmentation accuracy of a single-site network was substantially worse on data from unseen sites than on data from the training site. Training on multi-site data yielded marginally improved accuracy and robustness. However, including as few as 8 subjects from the unseen site, e.g. during commissioning of a new clinical system, yielded substantial improvement (regaining 75% of the difference in Dice score).},
+                                                         three deep-learning-based networks trained on data from a single site and on various configurations of data from multiple sites. We found that the segmentation accuracy of a single-site network was substantially worse on data from unseen sites than on data from the training site. Training on multi-site data yielded marginally improved accuracy and robustness. However, including as few as 8 subjects from the unseen site, e.g. during commissioning of a new clinical system, yielded substantial improvement (regaining 75% of the difference in Dice score).},
   file = {Gibs18.pdf:pdf\\Gibs18.pdf:PDF},
   optnote = {DIAG},
   gsid = {6478096673123844302},
@@ -9736,7 +9760,7 @@ @conference{Ginn13
   booktitle = RSNA,
   year = {2013},
   abstract = {BACKGROUND Lung diseases are among the most deadly disorders: chronic obstructive pulmonary disease ({COPD}), a devastating disease with 12 million people in the United States currently diagnosed, ranks #3 on the list of causes of death wordwide. Lung cancer, by far the most common and most deadly cancer in men and women worldwide, ranks #5. Tuberculosis ({TB}), despite the availability of a cheap and effective cure, ranks #10. Imaging is crucially important for early detection, diagnosis, follow-up, and treatment planning of {COPD}, lung cancer and {TB}. Chest radiography and computed tomography are the most important imaging modalities for the lung. METHODOLOGY/APPLICATION We present a flexible workstation for a quick and effective extraction of quantitative imaging parameters related to {COPD}, lung cancer and {TB}. The workstation loads an arbitrary number of {CT} and chest radiography studies of each subject simultaneously, allowing the user to instantly track the evolution of any lesion. Each {CT} scan is elastically registered to all prior {CT} scans of the same subject. Findings in prior scans have been automatically propagated and linked to findings in the current scan. All scans and processing results are preloaded in the background to ensure rapid reading. The {CIRRUS} {L}ung workstation has been developed jointly by the Diagnostic Image Analysis Group, Radboud University Nijmegen Medical Centre, Nijmegen The Netherlands, and Fraunhofer MEVIS, Bremen, Germany. It is based on the MeVisLab software platform. The workstation is available through research collaboration agreements and in active use in a variety of projects . {CIRRUS} {L}ung has a number of modes that will be demonstrated: 1) High throughput lung screening. Scan quality is automatically assessed; data with low quality, artifacts or underlying interstitial lung disease are flagged. High sensitivity computerized detection (CAD) of solid nodules and sub-solid nodules is included. 
High throughput reading with CAD as a first reader is supported. Each nodule is automatically characterized as solid, part-solid, non-solid, or benign (calcified lesions, perifissural lymph nodes). Volumetry, volume growth rate, mass and mass growth rate are automatically computed with advanced segmentation algorithms that have can handle sub-solid lesions and segment the solid core of part-solid nodules. Findings are summarized in a structured report. Follow-up recommendation according to Fleischner guidelines are included. 2) Clinical oncology work-up. This mode is similar to the screening mode, but includes completely automatic generation of {RECIST} workup. 3) Chest radiography lung screening. Chest radiographs can be annotated and viewed with various tools such as bone suppression and gray scale inversion. Computer-aided detection and interactive CAD reading are supported. 4) {COPD} quantification. Elastic registration between inspiration and expiration scans has been precomputed and allows for linked scrolling. Lungs, lobes, airways, fissures, and segments are automatically segmented for regional functional analysis. In case the user is not satisfied with the segmentation results, (s)he can quickly correct these with an intuitive interactive correction method. {CT} image standardization is included using a precomputed dedicated energy correction algorithm that makes quantifications less dependent on scan protocol (scanner model, kernel, iterative reconstruction). Once the segmentations have been approved, a range of quantifiable features can be visualized in the workstation: parenchyma features, airway features, and fissural completeness. Measurements are reported for both inspiration and expiration for the whole lung as well as per lobe and segment. Changes between inspiration and expiration are reported. After workup of a study of a {COPD} patient, a structured report is produced that contains screenshots, renderings, and all requested measurements. 
5) {TB} Diagnostics. In this mode chest radiographs can be inspected and texture analysis that detects
-                                                       abnormalities consistent with {TB} can be inspected. A novel symmetry analysis is available to facilitate contralateral comparisons. Detection and quantification of costophrenic angle bluntness is included. Cavities can be semi-automatically segmented. DEMONSTRATION STRATEGY The exhibit will be accompanied by an informational poster that will highlight the key features and algorithmic concepts that underlie the automated analysis. Attendees will be able to gain hands-on experience with the workstation and read cases. For each reading mode, extensive example datasets are available. In particular, the completely processed {LIDC/IDRI} database, including all {CT} scans and chest radiographs, is available for inspection. REFERENCES AND PUBLICATIONS The algorithms presented in the showcase are based on over 20 different journal publications. These are listed on http://cirrus.diagnijmegen.nl.},
+                                                         abnormalities consistent with {TB} can be inspected. A novel symmetry analysis is available to facilitate contralateral comparisons. Detection and quantification of costophrenic angle bluntness is included. Cavities can be semi-automatically segmented. DEMONSTRATION STRATEGY The exhibit will be accompanied by an informational poster that will highlight the key features and algorithmic concepts that underlie the automated analysis. Attendees will be able to gain hands-on experience with the workstation and read cases. For each reading mode, extensive example datasets are available. In particular, the completely processed {LIDC/IDRI} database, including all {CT} scans and chest radiographs, is available for inspection. REFERENCES AND PUBLICATIONS The algorithms presented in the showcase are based on over 20 different journal publications. These are listed on http://cirrus.diagnijmegen.nl.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -9837,8 +9861,8 @@ @conference{Ginn18b
   year = {2018},
   doi = {10.5334/jbsr.1656},
   abstract = {Artificial intelligence (AI), particularly deep learning, is currently at the top of the hype cycle. Application of this technology to the analysis of medical images is attracting a lot of attention worldwide.
-
-                                                       At the same time, the average radiologist is using very little to no AI tools in her daily practice. This lecture provides a brief explanation of deep learning and explains what makes this technology different from previous approaches and why it is so powerful. A number of AI applications, some in use that were developed and commercialized in our research group, are presented. These applications serve as examples to define a number of different types of AI products that differ in the way they are placed in (or outside) the workflow of radiologists. This lecture emphasizes how some of these tools replace (a small part of the work of) radiologists, while other augment radiologists, and yet others take the radiologists out of the loop in the care cycle of the patient. Finally, it is discussed how radiologists can, and should, be involved in the development of real-life AI applications.},
+  
+                                                         At the same time, the average radiologist is using very little to no AI tools in her daily practice. This lecture provides a brief explanation of deep learning and explains what makes this technology different from previous approaches and why it is so powerful. A number of AI applications, some in use that were developed and commercialized in our research group, are presented. These applications serve as examples to define a number of different types of AI products that differ in the way they are placed in (or outside) the workflow of radiologists. This lecture emphasizes how some of these tools replace (a small part of the work of) radiologists, while others augment radiologists, and yet others take the radiologists out of the loop in the care cycle of the patient. Finally, it is discussed how radiologists can, and should, be involved in the development of real-life AI applications.},
   file = {Ginn18b.pdf:pdf\\Ginn18b.pdf:PDF},
   optnote = {DIAG},
   gsid = {2647745699443927210},
@@ -9991,14 +10015,14 @@ @article{Glas23
   url = {http://dx.doi.org/10.1186/s13256-023-04097-4},
   volume = {17},
   abstract = {Abstract
-                                         Background
-                                         Chest X-ray offers high sensitivity and acceptable specificity as a tuberculosis screening tool, but in areas with a high burden of tuberculosis, there is often a lack of radiological expertise to interpret chest X-ray. Computer-aided detection systems based on artificial intelligence are therefore increasingly used to screen for tuberculosis-related abnormalities on digital chest radiographies. The CAD4TB software has previously been shown to demonstrate high sensitivity for chest X-ray tuberculosis-related abnormalities, but it is not yet calibrated for the detection of non-tuberculosis abnormalities. When screening for tuberculosis, users of computer-aided detection need to be aware that other chest pathologies are likely to be as prevalent as, or more prevalent than, active tuberculosis. However, non-tuberculosis chest X-ray abnormalities detected during chest X-ray screening for tuberculosis remain poorly characterized in the sub-Saharan African setting, with only minimal literature.
-
-                                         Case presentation
-                                         In this case series, we report on four cases with non-tuberculosis abnormalities detected on CXR in TB TRIAGE + ACCURACY (ClinicalTrials.gov Identifier: NCT04666311), a study in adult presumptive tuberculosis cases at health facilities in Lesotho and South Africa to determine the diagnostic accuracy of two potential tuberculosis triage tests: computer-aided detection (CAD4TB v7, Delft, the Netherlands) and C-reactive protein (Alere Afinion, USA). The four Black African participants presented with the following chest X-ray abnormalities: a 59-year-old woman with pulmonary arteriovenous malformation, a 28-year-old man with pneumothorax, a 20-year-old man with massive bronchiectasis, and a 47-year-old woman with aspergilloma.
-
-                                         Conclusions
-                                         Solely using chest X-ray computer-aided detection systems based on artificial intelligence as a tuberculosis screening strategy in sub-Saharan Africa comes with benefits, but also risks. Due to the limitation of CAD4TB for non-tuberculosis-abnormality identification, the computer-aided detection software may miss significant chest X-ray abnormalities that require treatment, as exemplified in our four cases. Increased data collection, characterization of non-tuberculosis anomalies and research on the implications of these diseases for individuals and health systems in sub-Saharan Africa is needed to help improve existing artificial intelligence software programs and their use in countries with high tuberculosis burden.},
+                                           Background
+                                           Chest X-ray offers high sensitivity and acceptable specificity as a tuberculosis screening tool, but in areas with a high burden of tuberculosis, there is often a lack of radiological expertise to interpret chest X-ray. Computer-aided detection systems based on artificial intelligence are therefore increasingly used to screen for tuberculosis-related abnormalities on digital chest radiographies. The CAD4TB software has previously been shown to demonstrate high sensitivity for chest X-ray tuberculosis-related abnormalities, but it is not yet calibrated for the detection of non-tuberculosis abnormalities. When screening for tuberculosis, users of computer-aided detection need to be aware that other chest pathologies are likely to be as prevalent as, or more prevalent than, active tuberculosis. However, non-tuberculosis chest X-ray abnormalities detected during chest X-ray screening for tuberculosis remain poorly characterized in the sub-Saharan African setting, with only minimal literature.
+  
+                                           Case presentation
+                                           In this case series, we report on four cases with non-tuberculosis abnormalities detected on CXR in TB TRIAGE + ACCURACY (ClinicalTrials.gov Identifier: NCT04666311), a study in adult presumptive tuberculosis cases at health facilities in Lesotho and South Africa to determine the diagnostic accuracy of two potential tuberculosis triage tests: computer-aided detection (CAD4TB v7, Delft, the Netherlands) and C-reactive protein (Alere Afinion, USA). The four Black African participants presented with the following chest X-ray abnormalities: a 59-year-old woman with pulmonary arteriovenous malformation, a 28-year-old man with pneumothorax, a 20-year-old man with massive bronchiectasis, and a 47-year-old woman with aspergilloma.
+  
+                                           Conclusions
+                                           Solely using chest X-ray computer-aided detection systems based on artificial intelligence as a tuberculosis screening strategy in sub-Saharan Africa comes with benefits, but also risks. Due to the limitation of CAD4TB for non-tuberculosis-abnormality identification, the computer-aided detection software may miss significant chest X-ray abnormalities that require treatment, as exemplified in our four cases. Increased data collection, characterization of non-tuberculosis anomalies and research on the implications of these diseases for individuals and health systems in sub-Saharan Africa is needed to help improve existing artificial intelligence software programs and their use in countries with high tuberculosis burden.},
   all_ss_ids = {[6266a3b05d4812bc03baef7c75b5e64edd6d9aae]},
   automatic = {yes},
   citation-count = {0},
@@ -10018,10 +10042,10 @@ @article{Gola21
   url = {http://dx.doi.org/10.1055/a-1543-6156},
   volume = {44},
   abstract = {Abstract
-                                   Purpose In this prospective, multicenter trial we evaluated whether additional shear wave elastography (SWE) for patients with BI-RADS 3 or 4 lesions on breast ultrasound could further refine the assessment with B-mode breast ultrasound for breast cancer diagnosis.
-                                   Materials and Methods We analyzed prospective, multicenter, international data from 1288 women with breast lesions rated by conventional 2 D B-mode ultrasound as BI-RADS 3 to 4c and undergoing 2D-SWE. After reclassification with SWE the proportion of undetected malignancies should be &lt; 2 %. All patients underwent histopathologic evaluation (reference standard).
-                                   Results Histopathologic evaluation showed malignancy in 368 of 1288 lesions (28.6 %). The assessment with B-mode breast ultrasound resulted in 1.39 % (6 of 431) undetected malignancies (malignant lesions in BI-RADS 3) and 53.80 % (495 of 920) unnecessary biopsies (biopsies in benign lesions). Re-classifying BI-RADS 4a patients with a SWE cutoff of 2.55 m/s resulted in 1.98 % (11 of 556) undetected malignancies and a reduction of 24.24 % (375 vs. 495) of unnecessary biopsies.
-                                   Conclusion A SWE value below 2.55 m/s for BI-RADS 4a lesions could be used to downstage these lesions to follow-up, and therefore reduce the number of unnecessary biopsies by 24.24 %. However, this would come at the expense of some additionally missed cancers compared to B-mode breast ultrasound (rate of undetected malignancies 1.98 %, 11 of 556, versus 1.39 %, 6 of 431) which would, however, still be in line with the ACR BI-RADS 3 definition (&lt; 2 % of undetected malignancies).},
+                                     Purpose In this prospective, multicenter trial we evaluated whether additional shear wave elastography (SWE) for patients with BI-RADS 3 or 4 lesions on breast ultrasound could further refine the assessment with B-mode breast ultrasound for breast cancer diagnosis.
+                                     Materials and Methods We analyzed prospective, multicenter, international data from 1288 women with breast lesions rated by conventional 2D B-mode ultrasound as BI-RADS 3 to 4c and undergoing 2D-SWE. After reclassification with SWE the proportion of undetected malignancies should be < 2 %. All patients underwent histopathologic evaluation (reference standard).
+                                     Results Histopathologic evaluation showed malignancy in 368 of 1288 lesions (28.6 %). The assessment with B-mode breast ultrasound resulted in 1.39 % (6 of 431) undetected malignancies (malignant lesions in BI-RADS 3) and 53.80 % (495 of 920) unnecessary biopsies (biopsies in benign lesions). Re-classifying BI-RADS 4a patients with a SWE cutoff of 2.55 m/s resulted in 1.98 % (11 of 556) undetected malignancies and a reduction of 24.24 % (375 vs. 495) of unnecessary biopsies.
+                                     Conclusion A SWE value below 2.55 m/s for BI-RADS 4a lesions could be used to downstage these lesions to follow-up, and therefore reduce the number of unnecessary biopsies by 24.24 %. However, this would come at the expense of some additionally missed cancers compared to B-mode breast ultrasound (rate of undetected malignancies 1.98 %, 11 of 556, versus 1.39 %, 6 of 431) which would, however, still be in line with the ACR BI-RADS 3 definition (< 2 % of undetected malignancies).},
   all_ss_ids = {[1dbf7c238e569a878500ed1defc5214f6c011366]},
   automatic = {yes},
   citation-count = {6},
@@ -10058,12 +10082,12 @@ @conference{Gome17
   booktitle = ARVO,
   year = {2017},
   abstract = {Purpose : To assess the performance of deep learning architectures based on convolutional neural networks (CNN) for the diagnosis of glaucoma in screening campaigns using color fundus images.
-
-                                                       Methods : Two independent data sets were used to develop and evaluate the proposed method. 1) 805 color fundus images with a field of view of 45 degrees, centered on the macula and including the optic disc (OD) from patients with age ranging from 55 to 86 years old included in a glaucoma detection campaign performed at Hospital Esperanza (Barcelona). Annotations were performed by eight observers having 8 to 26 years of clinical experience. 2) 101 images from the publicly available Drishti-GS retinal image dataset (http://cvit.iiit.ac.in/projects/mip/drishti-gs/mip-dataset2/Home.php). The total 906 images were further organized into a training, monitoring and test set according to a 60-20-20 split. The process to train and validate the CNN had 3 steps. 1) Preprocessing: the edges and the background were blurred to reduce the effect of the bright fringe and the border. Then patches centered at the OD of size 256x256x3 pixels were automatically segmented and scaled to values from 0 to 1. 2) Implementation: The architecture consisted of ten convolutional layers (32 filters 3x3 pixels size) followed by rectified linear units and spatial max-pooling. The network ends with a fully connected layer and a soft-max classifier which outputs a score from 0 to 1. The network was trained using stochastic gradient descent and a learning rate of 0.005. To avoid overfitting data augmentation was performed applying randomly translations, flipping and rotations during the training, and dropout with probability of 0.5. 3) Monitoring and evaluation: the training was completed after 50 epochs. To evaluate the classification capabilities of the algorithm, the area under the receiver operating characteristic curve (ROC) was calculated using the training set.
-
-                                                       Results : An automatic classification algorithm based on CNN was developed. The present method achieved an area under the ROC of 0.894. The accuracy to identify healthy and glaucoma cases was 0.884 and 0.781 respectively, using a threshold of 0.5.
-
-                                                       Conclusions : The good performance of the proposed CNN architecture suggests potential usefulness of these methods for an initial automatic classification of images in screening campaigns for glaucoma.},
+  
+                                                         Methods : Two independent data sets were used to develop and evaluate the proposed method. 1) 805 color fundus images with a field of view of 45 degrees, centered on the macula and including the optic disc (OD) from patients with age ranging from 55 to 86 years old included in a glaucoma detection campaign performed at Hospital Esperanza (Barcelona). Annotations were performed by eight observers having 8 to 26 years of clinical experience. 2) 101 images from the publicly available Drishti-GS retinal image dataset (http://cvit.iiit.ac.in/projects/mip/drishti-gs/mip-dataset2/Home.php). The total 906 images were further organized into a training, monitoring and test set according to a 60-20-20 split. The process to train and validate the CNN had 3 steps. 1) Preprocessing: the edges and the background were blurred to reduce the effect of the bright fringe and the border. Then patches centered at the OD of size 256x256x3 pixels were automatically segmented and scaled to values from 0 to 1. 2) Implementation: The architecture consisted of ten convolutional layers (32 filters 3x3 pixels size) followed by rectified linear units and spatial max-pooling. The network ends with a fully connected layer and a soft-max classifier which outputs a score from 0 to 1. The network was trained using stochastic gradient descent and a learning rate of 0.005. To avoid overfitting data augmentation was performed applying randomly translations, flipping and rotations during the training, and dropout with probability of 0.5. 3) Monitoring and evaluation: the training was completed after 50 epochs. To evaluate the classification capabilities of the algorithm, the area under the receiver operating characteristic curve (ROC) was calculated using the training set.
+  
+                                                         Results : An automatic classification algorithm based on CNN was developed. The present method achieved an area under the ROC of 0.894. The accuracy to identify healthy and glaucoma cases was 0.884 and 0.781 respectively, using a threshold of 0.5.
+  
+                                                         Conclusions : The good performance of the proposed CNN architecture suggests potential usefulness of these methods for an initial automatic classification of images in screening campaigns for glaucoma.},
   optnote = {DIAG, RADIOLOGY},
   gsid = {11583482517657678688},
   gscites = {2},
@@ -10113,12 +10137,12 @@ @conference{Gonz19a
   url = {https://iovs.arvojournals.org/article.aspx?articleid=2746850},
   title = {Opening the "black box" of deep learning in automated screening of eye diseases},
   abstract = {Purpose:  Systems based on deep learning (DL) have demonstrated to provide a scalable and high-performance solution for screening of eye diseases. However, DL is usually considered a "black box? due to lack of interpretability. We propose a deep visualization framework to explain the decisions made by a DL system, iteratively unveiling abnormalities responsible for referable predictions without needing lesion-level annotations. We apply the framework to automated screening of diabetic retinopathy (DR) in color fundus images (CFIs).
-
-                                                       Methods: The proposed framework consists of a baseline deep convolutional neural network to classify CFIs by DR stage. For each CFI classified as referable DR, the framework extracts initial visual evidence of the predicted stage by computing a saliency map, which indicates regions in the image that would contribute the most to changes in the prediction if modified. This provides localization of abnormalities that are then removed through selective inpainting. The image is again classified, expecting reduced referability. We iteratively apply this procedure to increase attention to less discriminative areas and generate refined visual evidence. The Kaggle DR database, with CFIs graded regarding DR severity (stages 0 and 1: non-referable DR, stages 2 to 4: referable DR), is used for training and validation of the image-level classification task. For validation of the obtained visual evidence, we used the DiaretDB1 dataset, which contains CFIs with manually-delineated areas for 4 types of lesions: hemorrhages, microaneurysms, hard and soft exudates.
-
-                                                       Results: The baseline classifier obtained an area under the Receiver Operating Characteristic (ROC) curve of 0.93 and a quadratic weighted kappa of 0.77 on the Kaggle test set (53576 CFIs). Free-response ROC (FROC) curves (Figure 2) analyze the correspondence between highlighted areas and each type of lesion for those images classified as referable DR in the DiaretDB1 dataset (62 CFIs), comparing between initial and refined visual evidence.
-
-                                                       Conclusions : The proposed framework provides visual evidence for the decisions made by a DL system, iteratively unveiling abnormalities in CFIs based on the prediction of a classifier trained only with image-level labels. This provides a "key? to open the "black box? of artificial intelligence in screening of eye diseases, aiming to increase experts' trust and facilitate its integration in screening settings.},
+  
+                                                         Methods: The proposed framework consists of a baseline deep convolutional neural network to classify CFIs by DR stage. For each CFI classified as referable DR, the framework extracts initial visual evidence of the predicted stage by computing a saliency map, which indicates regions in the image that would contribute the most to changes in the prediction if modified. This provides localization of abnormalities that are then removed through selective inpainting. The image is again classified, expecting reduced referability. We iteratively apply this procedure to increase attention to less discriminative areas and generate refined visual evidence. The Kaggle DR database, with CFIs graded regarding DR severity (stages 0 and 1: non-referable DR, stages 2 to 4: referable DR), is used for training and validation of the image-level classification task. For validation of the obtained visual evidence, we used the DiaretDB1 dataset, which contains CFIs with manually-delineated areas for 4 types of lesions: hemorrhages, microaneurysms, hard and soft exudates.
+  
+                                                         Results: The baseline classifier obtained an area under the Receiver Operating Characteristic (ROC) curve of 0.93 and a quadratic weighted kappa of 0.77 on the Kaggle test set (53576 CFIs). Free-response ROC (FROC) curves (Figure 2) analyze the correspondence between highlighted areas and each type of lesion for those images classified as referable DR in the DiaretDB1 dataset (62 CFIs), comparing between initial and refined visual evidence.
+  
+                                                         Conclusions : The proposed framework provides visual evidence for the decisions made by a DL system, iteratively unveiling abnormalities in CFIs based on the prediction of a classifier trained only with image-level labels. This provides a "key" to open the "black box" of artificial intelligence in screening of eye diseases, aiming to increase experts' trust and facilitate its integration in screening settings.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
   all_ss_ids = {[80af090645088134f058db53a708b7092dd28786]},
@@ -10179,15 +10203,15 @@ @conference{Gonz20c
   url = {https://www.euretina.org/congress/amsterdam-2020/virtual-2020-freepapers/},
   title = {Are adversarial attacks an actual threat for deep learning systems in real-world eye disease screening settings?},
   abstract = {Purpose:
-                                				Deep learning (DL) systems that perform image-level classification with convolutional neural networks (CNNs) have been shown to provide high-performance solutions for automated screening of eye diseases. Nevertheless, adversarial attacks have been recently screening settings, where there is restricted access to the systems and limited knowledge about certain factors, such as their CNN architecture or the data used for development.
-                              				Setting:
-                              				Deep learning for automated screening of eye diseases.
-                              				Methods:
-                              				We used the Kaggle dataset for diabetic retinopathy detection. It contains 88,702 manually-labelled color fundus images, which we split into test (12%) and development (88%). Development data were split into two equally-sized sets (d1 and d2); a third set (d3) was generated using half of the images in d2. In each development set, 80%/20% of the images were used for training/validation. All splits were done randomly at patient-level. As attacked system, we developed a randomly-initialized CNN based on the Inception-v3 architecture using d1. We performed the attacks (1) in a white-box (WB) setting, with full access to the attacked system to generate the adversarial images, and (2) in black-box (BB) settings, without access to the attacked system and using a surrogate system to craft the attacks. We simulated different BB settings, sequentially decreasing the available knowledge about the attacked system: same architecture, using d1 (BB-1); different architecture (randomly-initialized DenseNet-121), using d1 (BB-2); same architecture, using d2 (BB-3); different architecture, using d2 (BB-4); different architecture, using d3 (BB-5). In each setting, adversarial images containing non-perceptible noise were generated by applying the fast gradient sign method to each image of the test set and processed by the attacked system.
-                              				Results:
-                              				The performance of the attacked system to detect referable diabetic retinopathy without attacks and under the different attack settings was measured on the test set using the area under the receiver operating characteristic curve (AUC). Without attacks, the system achieved an AUC of 0.88. In each attack setting, the relative decrease in AUC with respect to the original performance was computed. In the WB setting, there was a 99.9% relative decrease in performance. In the BB-1 setting, the relative decrease in AUC was 67.3%. In the BB-2 setting, the AUC suffered a 40.2% relative decrease. In the BB-3 setting, the relative decrease was 37.9%. In the BB-4 setting, the relative decrease in AUC was 34.1%. Lastly, in the BB-5 setting, the performance of the attacked system decreased 3.8% regarding its original performance.
-                              				Conclusions:
-                              				The results obtained in the different settings show a drastic decrease of the attacked DL system's vulnerability to adversarial attacks when the access and knowledge about it are limited. The impact on performance is extremely reduced when restricting the direct access to the system (from the WB to the BB-1 setting). The attacks become slightly less effective when not having access to the same development data (BB-3), compared to not using the same CNN architecture (BB-2). Attacks' effectiveness further decreases when both factors are unknown (BB-4). If the amount of development data is additionally reduced (BB-5), the original performance barely deteriorates. This last setting is the most similar to realistic screening settings, since most systems are currently closed source and use additional large private datasets for development. In conclusion, these factors should be acknowledged for future development of robust DL systems, as well as considered when evaluating the vulnerability of currently-available systems to adversarial attacks. Having limited access and knowledge about the systems determines the actual threat these attacks pose. We believe awareness about this matter will increase experts' trust and facilitate the integration of DL systems in real-world settings.},
+                                  				Deep learning (DL) systems that perform image-level classification with convolutional neural networks (CNNs) have been shown to provide high-performance solutions for automated screening of eye diseases. Nevertheless, adversarial attacks have been recently screening settings, where there is restricted access to the systems and limited knowledge about certain factors, such as their CNN architecture or the data used for development.
+                                				Setting:
+                                				Deep learning for automated screening of eye diseases.
+                                				Methods:
+                                				We used the Kaggle dataset for diabetic retinopathy detection. It contains 88,702 manually-labelled color fundus images, which we split into test (12%) and development (88%). Development data were split into two equally-sized sets (d1 and d2); a third set (d3) was generated using half of the images in d2. In each development set, 80%/20% of the images were used for training/validation. All splits were done randomly at patient-level. As attacked system, we developed a randomly-initialized CNN based on the Inception-v3 architecture using d1. We performed the attacks (1) in a white-box (WB) setting, with full access to the attacked system to generate the adversarial images, and (2) in black-box (BB) settings, without access to the attacked system and using a surrogate system to craft the attacks. We simulated different BB settings, sequentially decreasing the available knowledge about the attacked system: same architecture, using d1 (BB-1); different architecture (randomly-initialized DenseNet-121), using d1 (BB-2); same architecture, using d2 (BB-3); different architecture, using d2 (BB-4); different architecture, using d3 (BB-5). In each setting, adversarial images containing non-perceptible noise were generated by applying the fast gradient sign method to each image of the test set and processed by the attacked system.
+                                				Results:
+                                				The performance of the attacked system to detect referable diabetic retinopathy without attacks and under the different attack settings was measured on the test set using the area under the receiver operating characteristic curve (AUC). Without attacks, the system achieved an AUC of 0.88. In each attack setting, the relative decrease in AUC with respect to the original performance was computed. In the WB setting, there was a 99.9% relative decrease in performance. In the BB-1 setting, the relative decrease in AUC was 67.3%. In the BB-2 setting, the AUC suffered a 40.2% relative decrease. In the BB-3 setting, the relative decrease was 37.9%. In the BB-4 setting, the relative decrease in AUC was 34.1%. Lastly, in the BB-5 setting, the performance of the attacked system decreased 3.8% regarding its original performance.
+                                				Conclusions:
+                                				The results obtained in the different settings show a drastic decrease of the attacked DL system's vulnerability to adversarial attacks when the access and knowledge about it are limited. The impact on performance is extremely reduced when restricting the direct access to the system (from the WB to the BB-1 setting). The attacks become slightly less effective when not having access to the same development data (BB-3), compared to not using the same CNN architecture (BB-2). Attacks' effectiveness further decreases when both factors are unknown (BB-4). If the amount of development data is additionally reduced (BB-5), the original performance barely deteriorates. This last setting is the most similar to realistic screening settings, since most systems are currently closed source and use additional large private datasets for development. In conclusion, these factors should be acknowledged for future development of robust DL systems, as well as considered when evaluating the vulnerability of currently-available systems to adversarial attacks. Having limited access and knowledge about the systems determines the actual threat these attacks pose. We believe awareness about this matter will increase experts' trust and facilitate the integration of DL systems in real-world settings.},
   optnote = {DIAG, RADIOLOGY},
   year = {2020},
   month = {9},
@@ -10199,12 +10223,12 @@ @conference{Gonz21
   url = {https://iovs.arvojournals.org/article.aspx?articleid=2773295},
   title = {Hierarchical curriculum learning for robust automated detection of low-prevalence retinal disease features: application to reticular pseudodrusen},
   abstract = {Purpose: The low prevalence of certain retinal disease features compromises data collection for deep neural networks (DNN) development and, consequently, the benefits of automated detection. We robustify the detection of such features in scarce data settings by exploiting hierarchical information available in the data to learn from generic to specific, low-prevalence features. We focus on reticular pseudodrusen (RPD), a hallmark of intermediate age-related macular degeneration (AMD).
-
-                               Methods: Color fundus images (CFI) from the AREDS dataset were used for DNN development (106,994 CFI) and testing (27,066 CFI). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. In both datasets CFI were graded from generic to specific features. This allows to establish a hierarchy of binary classification tasks with decreasing prevalence: presence of AMD findings (AREDS prevalence: 88%; RS1-6: 77%), drusen (85%; 73%), large drusen (40%; 24%), RPD (1%; 4%). We created a hierarchical curriculum and developed a DNN (HC-DNN) that learned each task sequentially. We computed its performance for RPD detection in both test sets and compared it to a baseline DNN (B-DNN) that learned to detect RPD from scratch disregarding hierarchical information. We studied their robustness across datasets, while reducing the size of data available for development (same prevalences)
-
-                               Results: Area under the receiver operating characteristic curve (AUC) was used to measure RPD detection performance. When large development data were available, there was no significant difference between DNNs (100% data, HC-DNN: 0.96 (95% CI, 0.94-0.97) in AREDS, 0.82 (0.78-0.86) in RS1-6; B-DNN: 0.95 (0.94-0.96) in AREDS, 0.83 (0.79-0.87) in RS1-6). However, HC-DNN achieved better performance and robustness across datasets when development data were highly reduced (<50% data, p-values<0.05) (1% data, HC-DNN: 0.63 (0.60-0.66) in AREDS, 0.76 (0.72-0.80) in RS1-6; B-DNN: 0.53 (0.49-0.56) in AREDS, 0.48 (0.42-0.53) in RS1-6).
-
-                               Conclusions: Hierarchical curriculum learning allows for knowledge transfer from general, higher-prevalence features and becomes beneficial for the detection of low-prevalence retinal features, such as RPD, in scarce data settings. Moreover, exploiting hierarchical information improves DNN robustness across datasets.},
+  
+                                 Methods: Color fundus images (CFI) from the AREDS dataset were used for DNN development (106,994 CFI) and testing (27,066 CFI). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. In both datasets CFI were graded from generic to specific features. This allows to establish a hierarchy of binary classification tasks with decreasing prevalence: presence of AMD findings (AREDS prevalence: 88%; RS1-6: 77%), drusen (85%; 73%), large drusen (40%; 24%), RPD (1%; 4%). We created a hierarchical curriculum and developed a DNN (HC-DNN) that learned each task sequentially. We computed its performance for RPD detection in both test sets and compared it to a baseline DNN (B-DNN) that learned to detect RPD from scratch disregarding hierarchical information. We studied their robustness across datasets, while reducing the size of data available for development (same prevalences)
+  
+                                 Results: Area under the receiver operating characteristic curve (AUC) was used to measure RPD detection performance. When large development data were available, there was no significant difference between DNNs (100% data, HC-DNN: 0.96 (95% CI, 0.94-0.97) in AREDS, 0.82 (0.78-0.86) in RS1-6; B-DNN: 0.95 (0.94-0.96) in AREDS, 0.83 (0.79-0.87) in RS1-6). However, HC-DNN achieved better performance and robustness across datasets when development data were highly reduced (<50% data, p-values<0.05) (1% data, HC-DNN: 0.63 (0.60-0.66) in AREDS, 0.76 (0.72-0.80) in RS1-6; B-DNN: 0.53 (0.49-0.56) in AREDS, 0.48 (0.42-0.53) in RS1-6).
+  
+                                 Conclusions: Hierarchical curriculum learning allows for knowledge transfer from general, higher-prevalence features and becomes beneficial for the detection of low-prevalence retinal features, such as RPD, in scarce data settings. Moreover, exploiting hierarchical information improves DNN robustness across datasets.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -10215,14 +10239,14 @@ @conference{Gonz21a
   title = {Deep learning for automated stratification of ophthalmic images: Application to age-related macular degeneration and color fundus images},
   url = {https://euretina.org/resource/abstract_2021_deep-learning-for-automated-stratification-of-ophthalmic-images-application-to-age-related-macular-degeneration-and-color-fundus-images/},
   abstract = {Purpose: Deep learning (DL) systems based on convolutional neural networks (CNNs) have achieved expert-level performance in different classification tasks, and have shown the potential to reduce current experts' workload significantly. We explore this potential in the context of automated stratification of ophthalmic images. DL could accelerate the setup of clinical studies by filtering large amounts of images or patients based on specific inclusion criteria, as well as aid in patient selection for clinical trials. DL could also allow for automated categorization of entering images in busy clinical or screening settings, enhancing data triaging, searching, retrieval, and comparison. Automated stratification could also facilitate data collection and application of further DL-based phenotyping analysis, by generating useful sets of images for expert annotation, training, or testing of segmentation algorithms. In our work, we focus on the stratification of color fundus images (CFI) based on multiple features related to age-related macular degeneration (AMD) at different hierarchical levels. We further analyze the robustness of the automated stratification system when the amount of data available for development is limited. We performed our validation on two different population studies.
-
-                               Setting/Venue: Deep learning applied to ophthalmic imaging.
-
-                               Methods: Automated stratification of CFI was performed based on the presence or absence of the following AMD features, following a hierarchical tree with different branches (Bi) and levels (Hi) from generic features (H0) to specific features (H3): AMD findings (H0); B1: drusen (H1), large drusen (H2), reticular pseudodrusen (H3); B2: pigmentary changes (H1), hyperpigmentation (H2), hypopigmentation (H2); B3: late AMD (H1), geographic atrophy (H2), choroidal neovascularization (H2). The automated stratification system consisted of a set of CNNs (based on the Inception-v3 architecture) able to classify the multiple AMD features (presence/absence) at higher and lower levels. This allowed to automatically stratify incoming CFI into the hierarchical tree. CFI from the AREDS dataset were used for development (106,994 CFI) and testing (27,066 CFI) of the CNNs. We validated the robustness of the system to a gradual decrease in the amount of data available for development (100%, 75%, 50%, 25%, 10%, 5%, 2.5%, and 1% of development data). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. This allowed to validate the performance of the automated stratification across studies where different CFI grading protocols were used.
-
-                               Results: Area under the receiver operating characteristic curve (AUC) was used to measure the performance of each feature's classification within the automated stratification. The AUC averaged across AMD features when 100% of development data was available was 93.8% (95% CI, 93.4%-94.2%) in AREDS and 84.4% (82.1%-86.5%) in RS1-6. There was an average relative decrease in performance of 10.0+-4.7% between AREDS and the external test set, RS1-6. The performance of the system decreased gradually with each development data reduction. When only 1% of data was available for development, the average AUC was 81.9% (81.0%-82.8%) in AREDS and 74.0% (70.8%-77.0%) in RS1-6. This corresponded to an average relative decrease in performance of 12.7+-13.2% in AREDS and 12.6+-7.8% in RS1-6.
-
-                               Conlusions: The automated stratification system achieved overall high performance in the classification of different features independently of their hierarchical level. This shows the potential of DL systems to identify diverse phenotypes and to obtain an accurate automated stratification of CFI. The results showed that automated stratification was also robust to a dramatic reduction in the data available for development, maintaining the average AUC above 80%. This is a positive observation, considering that the amount of data available for DL development can be limited in some settings, and the gradings can be costly to obtain. Nevertheless, variability in performance across features could be observed, especially for those with very low prevalence, such as reticular pseudodrusen, where performance became more unstable when few data were available. The external validation showed these observations held when the automated stratification was applied in a different population study, with an expected (but not drastic) drop of performance due to differences between datasets and their grading protocols. In conclusion, our work supports that DL is a powerful tool for the filtering and stratification of ophthalmic images, and has the potential to reduce the workload of experts while supporting them in research and clinical settings.},
+  
+                                 Setting/Venue: Deep learning applied to ophthalmic imaging.
+  
+                                 Methods: Automated stratification of CFI was performed based on the presence or absence of the following AMD features, following a hierarchical tree with different branches (Bi) and levels (Hi) from generic features (H0) to specific features (H3): AMD findings (H0); B1: drusen (H1), large drusen (H2), reticular pseudodrusen (H3); B2: pigmentary changes (H1), hyperpigmentation (H2), hypopigmentation (H2); B3: late AMD (H1), geographic atrophy (H2), choroidal neovascularization (H2). The automated stratification system consisted of a set of CNNs (based on the Inception-v3 architecture) able to classify the multiple AMD features (presence/absence) at higher and lower levels. This allowed to automatically stratify incoming CFI into the hierarchical tree. CFI from the AREDS dataset were used for development (106,994 CFI) and testing (27,066 CFI) of the CNNs. We validated the robustness of the system to a gradual decrease in the amount of data available for development (100%, 75%, 50%, 25%, 10%, 5%, 2.5%, and 1% of development data). An external test set (RS1-6) was generated with 2,790 CFI from the Rotterdam Study. This allowed to validate the performance of the automated stratification across studies where different CFI grading protocols were used.
+  
+                                 Results: Area under the receiver operating characteristic curve (AUC) was used to measure the performance of each feature's classification within the automated stratification. The AUC averaged across AMD features when 100% of development data was available was 93.8% (95% CI, 93.4%-94.2%) in AREDS and 84.4% (82.1%-86.5%) in RS1-6. There was an average relative decrease in performance of 10.0+-4.7% between AREDS and the external test set, RS1-6. The performance of the system decreased gradually with each development data reduction. When only 1% of data was available for development, the average AUC was 81.9% (81.0%-82.8%) in AREDS and 74.0% (70.8%-77.0%) in RS1-6. This corresponded to an average relative decrease in performance of 12.7+-13.2% in AREDS and 12.6+-7.8% in RS1-6.
+  
+                                 Conclusions: The automated stratification system achieved overall high performance in the classification of different features independently of their hierarchical level. This shows the potential of DL systems to identify diverse phenotypes and to obtain an accurate automated stratification of CFI. The results showed that automated stratification was also robust to a dramatic reduction in the data available for development, maintaining the average AUC above 80%. This is a positive observation, considering that the amount of data available for DL development can be limited in some settings, and the gradings can be costly to obtain. Nevertheless, variability in performance across features could be observed, especially for those with very low prevalence, such as reticular pseudodrusen, where performance became more unstable when few data were available. The external validation showed these observations held when the automated stratification was applied in a different population study, with an expected (but not drastic) drop of performance due to differences between datasets and their grading protocols. In conclusion, our work supports that DL is a powerful tool for the filtering and stratification of ophthalmic images, and has the potential to reduce the workload of experts while supporting them in research and clinical settings.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -10233,20 +10257,20 @@ @conference{Gonz21b
   url = {https://journals.sagepub.com/doi/full/10.1177/11206721211047031},
   title = {Trustworthy AI: closing the gap between development and integration of AI in Ophthalmology},
   abstract = {Design: Descriptive study.
-
-                               Purpose: To identify the main aspects that currently complicate the integration of artificial intelligence (AI) in ophthalmic settings.
-
-                               Methods: Based on an extensive review of state-of-the-art literature of AI applied to Ophthalmology plus interviews with multidisciplinary, international experts, we identified the most relevant aspects to consider during AI design to generate trustworthy (i.e.,  transparent, robust, and sustainable) AI systems and, consequently, facilitate a subsequent successful integration in real-world ophthalmic settings.
-
-                               Results: Several essential aspects to consider were identified:
-                               1) The reliability of the human annotations that are used for establishing the reference standard an AI system learns from, or for setting robust observer studies that allow for fair human-AI performance comparison.
-                               2) The ability of an AI system to generalize across populations, ophthalmic settings, and data acquisition protocols in order to avoid the negative consequences of algorithmic bias and lack of domain adaptation.
-                               3)The integration of multimodal data for AI development to consider multiple contexts when available (phenotyping, genotyping, systemic variables, patient medical history...).
-                               4) The importance of providing interpretable AI-based predictions to open the "black box" and increase trust and clinical usability.
-                               5) A plan to monitor the impact of AI on the clinical workflow, i.e., the adaptation of healthcare providers and patients to the new technology, human-AI interaction, cost-benefit analyses...
-                               6) The necessity to update current regulations to accelerate and control AI integration and all related aspects, such as patient privacy, systems' updates, and liability.
-
-                               Conclusions: It is important that healthcare providers in Ophthalmology consider these aspects and their consequences when thinking of AI in practice. It is key that all involved stakeholders collaborate and interact from the beginning of the AI design process to ensure a good alignment with real-world clinical needs and settings. This way, it will be possible to generate trustworthy AI solutions and close the gap between development and deployment, so that the AI benefits currently shown on paper reach the final users.},
+  
+                                 Purpose: To identify the main aspects that currently complicate the integration of artificial intelligence (AI) in ophthalmic settings.
+  
+                                 Methods: Based on an extensive review of state-of-the-art literature of AI applied to Ophthalmology plus interviews with multidisciplinary, international experts, we identified the most relevant aspects to consider during AI design to generate trustworthy (i.e.,  transparent, robust, and sustainable) AI systems and, consequently, facilitate a subsequent successful integration in real-world ophthalmic settings.
+  
+                                 Results: Several essential aspects to consider were identified:
+                                 1) The reliability of the human annotations that are used for establishing the reference standard an AI system learns from, or for setting robust observer studies that allow for fair human-AI performance comparison.
+                                 2) The ability of an AI system to generalize across populations, ophthalmic settings, and data acquisition protocols in order to avoid the negative consequences of algorithmic bias and lack of domain adaptation.
+                                 3) The integration of multimodal data for AI development to consider multiple contexts when available (phenotyping, genotyping, systemic variables, patient medical history...).
+                                 4) The importance of providing interpretable AI-based predictions to open the "black box" and increase trust and clinical usability.
+                                 5) A plan to monitor the impact of AI on the clinical workflow, i.e., the adaptation of healthcare providers and patients to the new technology, human-AI interaction, cost-benefit analyses...
+                                 6) The necessity to update current regulations to accelerate and control AI integration and all related aspects, such as patient privacy, systems' updates, and liability.
+  
+                                 Conclusions: It is important that healthcare providers in Ophthalmology consider these aspects and their consequences when thinking of AI in practice. It is key that all involved stakeholders collaborate and interact from the beginning of the AI design process to ensure a good alignment with real-world clinical needs and settings. This way, it will be possible to generate trustworthy AI solutions and close the gap between development and deployment, so that the AI benefits currently shown on paper reach the final users.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -10271,14 +10295,14 @@ @phdthesis{Gonz23
   year = {2023},
   url = {https://repository.ubn.ru.nl/handle/2066/299173},
   abstract = {This thesis contributes to the current research (academic and industrial), regulatory, and ethical landscapes by advancing our understanding of trustworthiness in AI systems in healthcare, particularly in the context of ophthalmology. Its overall objective is to provide insights, explore solutions, and provide recommendations for the development of trustworthy DL-based systems, thereby contributing to lessen the existing gap between the development and integration of AI in healthcare and ophthalmology.
-
-         Chapter 2. In this chapter, we study the reliability of a CE-certified, DL-based device for the joint automated screening of DR and AMD in CFP. By performing an external, multi-center validation, we investigate the ability of the commercially-available system to generalize across populations and imaging acquisition protocols. We also compare its performance to that of a group of international retinal experts, and explore the consistency of human observers when it comes to DR and AMD grading. Our work supports that AI can facilitate access to joint screening of retinal diseases and that currently available AI solutions can provide reliable and objective support to eye care providers.
-
-         Chapter 3. In this chapter, we focus on the explainability of DL systems' decisions and its impact on trust and clinical usability. We propose a deep visualization method, called visual evidence augmentation, to enhance DL models' explainability in classification tasks in medical imaging. The novel method combines visual attribution and selective inpainting and iteratively unveils abnormalities responsible for anomalous predictions, without the need of manual, lesion-level annotations. We apply the method to automated screening of DR and AMD in CFP, and demonstrate its ability to improve weakly-supervised localization of different types of abnormalities. With this work, we contribute to opening the "black box" of AI and hence increasing experts' trust and facilitating its integration in clinical settings.
-
-         Chapter 4. In this chapter, we focus on the robustness of DL systems against malicious attacks and the importance of defining their actual threat. We study previously unexplored factors affecting the vulnerability of DL systems to adversarial attacks in three different medical applications and imaging modalities: screening for referable DR in CFP, classification of pathologies in chest X-Ray, and detection of breast cancer metastasis in histopathology slides of lymph node sections. We demonstrate that ImageNet pre-training, commonly used in medical imaging, may substantially increase adversarial attack vulnerability, and that disparity in the training data of the target and the attacker's model decreases attack performance. This work also provides recommendations to increase the safety of DL systems meant to be clinically mdeployed and to perform realistic evaluations of adversarial robustness.
-
-         Chapter 5. In this chapter, we explore the main aspects and challenges to be considered along the AI design pipeline in ophthalmology so as to generate systems that meet the requirements to be deemed trustworthy, including those concerning accuracy, resiliency, reliability, safety, and accountability. We elaborate on mechanisms to address those aspects and challenges at specific points of patient care, and define the roles, responsibilities, and interactions between the different stakeholders involved in AI for ophthalmic care. This study plays a role in establishing the basis for a greatly needed collaborative approach, as well as identifying key action points to ensure the potential benefits of AI reach real-world ophthalmic settings. The main findings from this work can be translated to other medical specialties.},
+  
+           Chapter 2. In this chapter, we study the reliability of a CE-certified, DL-based device for the joint automated screening of DR and AMD in CFP. By performing an external, multi-center validation, we investigate the ability of the commercially-available system to generalize across populations and imaging acquisition protocols. We also compare its performance to that of a group of international retinal experts, and explore the consistency of human observers when it comes to DR and AMD grading. Our work supports that AI can facilitate access to joint screening of retinal diseases and that currently available AI solutions can provide reliable and objective support to eye care providers.
+  
+           Chapter 3. In this chapter, we focus on the explainability of DL systems' decisions and its impact on trust and clinical usability. We propose a deep visualization method, called visual evidence augmentation, to enhance DL models' explainability in classification tasks in medical imaging. The novel method combines visual attribution and selective inpainting and iteratively unveils abnormalities responsible for anomalous predictions, without the need of manual, lesion-level annotations. We apply the method to automated screening of DR and AMD in CFP, and demonstrate its ability to improve weakly-supervised localization of different types of abnormalities. With this work, we contribute to opening the "black box" of AI and hence increasing experts' trust and facilitating its integration in clinical settings.
+  
+           Chapter 4. In this chapter, we focus on the robustness of DL systems against malicious attacks and the importance of defining their actual threat. We study previously unexplored factors affecting the vulnerability of DL systems to adversarial attacks in three different medical applications and imaging modalities: screening for referable DR in CFP, classification of pathologies in chest X-Ray, and detection of breast cancer metastasis in histopathology slides of lymph node sections. We demonstrate that ImageNet pre-training, commonly used in medical imaging, may substantially increase adversarial attack vulnerability, and that disparity in the training data of the target and the attacker's model decreases attack performance. This work also provides recommendations to increase the safety of DL systems meant to be clinically deployed and to perform realistic evaluations of adversarial robustness.
+  
+           Chapter 5. In this chapter, we explore the main aspects and challenges to be considered along the AI design pipeline in ophthalmology so as to generate systems that meet the requirements to be deemed trustworthy, including those concerning accuracy, resiliency, reliability, safety, and accountability. We elaborate on mechanisms to address those aspects and challenges at specific points of patient care, and define the roles, responsibilities, and interactions between the different stakeholders involved in AI for ophthalmic care. This study plays a role in establishing the basis for a greatly needed collaborative approach, as well as identifying key action points to ensure the potential benefits of AI reach real-world ophthalmic settings. The main findings from this work can be translated to other medical specialties.},
   copromotor = {S\'{a}nchez, Clara I. and B. van Ginneken},
   file = {Gonz23.pdf:pdf\\Gonz23.pdf:PDF},
   optnote = {DIAG},
@@ -10377,10 +10401,10 @@ @conference{Graa24b
   booktitle = ECR,
   title = {External validation of the Sybil risk model as a tool to identify low-risk individuals eligible for biennial lung cancer screening},
   abstract = {Purpose: Lung cancer screening protocols for follow up intervals should minimise harm, maximise cost-effectiveness, and avoid diagnostic delays. ILST suggests biennial follow-up for low-risk participants. The study aimed to retrospectively evaluate Sybil, a deep learning algorithm predicting lung cancer risk for 6 years from one LDCT, comparing it to PanCan2b for identifying biennial screening eligibility.
-               Methods and materials: DLCST baseline scans included 1870 non-cancer and 25 screen-detected cancer cases, diagnosed within 2 years. Sybil (scan level) and PanCan2b (per nodule) predicted risk of developing cancer within 2 years. For cases with no screen-annotated nodules, the PanCan2b risk score for participants was set as 0%. For both models, we used a nodule-risk cut-off of <1.5% to identify low-risk participants for biennial follow-up, based on ILST. For PanCan2b, the risk dominant nodule per scan was considered.
-               Results: The Sybil and PanCan2B models identified 1616 and 1697 individuals, respectively, meeting the criteria for biennial screening. This would result in a reduction of 87% and 94% of CT scans in the second screening round, respectively. The group referred for biennial screening included 8 and 9 cancers for Sybil and PanCan2B, respectively.
-               Conclusion: Both Sybil and PanCan2B selected a large group of low-risk participants for biennial screening when a <1.5% risk threshold was used at baseline CT. The difference between Sybil and the PanCan2b model is small. More research is needed to study the type of cancers with delayed diagnosis and whether such delay leads to diagnostic stage shift. In addition, more external validation of the Sybil model on other datasets is necessary to further assess its applicability in lung cancer screening, and to evaluate its performance on follow-up imaging.
-               Limitations: This study is a baseline, retrospective analysis on data from one screening trial.},
+                 Methods and materials: DLCST baseline scans included 1870 non-cancer and 25 screen-detected cancer cases, diagnosed within 2 years. Sybil (scan level) and PanCan2b (per nodule) predicted risk of developing cancer within 2 years. For cases with no screen-annotated nodules, the PanCan2b risk score for participants was set as 0%. For both models, we used a nodule-risk cut-off of <1.5% to identify low-risk participants for biennial follow-up, based on ILST. For PanCan2b, the risk dominant nodule per scan was considered.
+                 Results: The Sybil and PanCan2b models identified 1616 and 1697 individuals, respectively, meeting the criteria for biennial screening. This would result in a reduction of 87% and 94% of CT scans in the second screening round, respectively. The group referred for biennial screening included 8 and 9 cancers for Sybil and PanCan2b, respectively.
+                 Conclusion: Both Sybil and PanCan2b selected a large group of low-risk participants for biennial screening when a <1.5% risk threshold was used at baseline CT. The difference between Sybil and the PanCan2b model is small. More research is needed to study the type of cancers with delayed diagnosis and whether such delay leads to diagnostic stage shift. In addition, more external validation of the Sybil model on other datasets is necessary to further assess its applicability in lung cancer screening, and to evaluate its performance on follow-up imaging.
+                 Limitations: This study is a baseline, retrospective analysis on data from one screening trial.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -10390,13 +10414,13 @@ @conference{Graa24c
   booktitle = ESTI,
   title = {Assessing the agreement between privacy-preserving Llama model and human experts when labelling radiology reports for specific significant incidental findings in lung cancer screening},
   abstract = {Purpose/Objectives:
-      The ERS/ESTS/ESTRO/ESR/ESTI/EFOMP statement on management of incidental findings from low dose CT screening for lung cancer recognizes nine specific significant incidental findings (SIFs). To detect these SIFs efficiently and accurately during lung cancer screening, artificial intelligence algorithms may aid human experts. Automatic identification of scans with SIFs would be required to effectively train AI systems to detect these SIFs. In this study, we investigate the agreement between out-of-box Llama2-7b-chat, privacy-preserving, large language model (LLM) and human experts in labeling SIFs in thorax-abdomen radiology reports.
-      Methods & Materials:
-      In this study, 100 CT radiology reports from Radboud University Medical Center were examined for nine specific SIFs. The LLM generated outputs for the presence (1) or absence (0) of the SIFs, using an engineered system prompt. Agreements between LLM outputs and two human experts, the agreement between five runs of the LLM, and the agreement between the two human experts was assessed using Fleiss k, with bootstrapped 95% confidence intervals.
-      Results:
-      The interobserver agreement for the nine specific SIFs for the two human experts was substantial, median k = 0.763 (0.479, 0.891). The agreement between the LLM and the first human expert was moderate, median k = 0.427 (0.181, 0.661) and between the LLM and the second human expert was fair, median k = 0.395 (0.101, 0.619), for the nine SIFs. The agreement between the five runs of the LLM was almost perfect k = 0.970 (0.912, 1.000). When analyzing agreement in specific SIFs, we found substantial agreement between the LLM and each observer for bronchiectasis (k = 0.667 (0.327, 0.884) and k = 0.634 (0.293, 0.878)), and coronary artery calcifications (k = 0.627 (0.433, 0.792) and k = 0.610 (0.381, 0.793), and poor agreement for thyroid abnormalities (k = -0.027 (-0.048, -0.005), k = -0.020 (-0.042, -0.005)). The agreement between the human experts was substantial for bronchiectasis k = 0.889 (0.693, 1.000), and for thyroid abnormalities k = 0.884 (0.479, 1.000), and poor for interstitial lung abnormalities k = 0.321 (-0.053, 0.628), and for mediastinal masses k = 0.313 (-0.036, 0.795).
-      Conclusion:
-      This study demonstrates that there is a large difference between the agreement of each human observer and the LLM, compared to the agreement between the two human observers. This study highlights the potential for LLMs to automatically label radiology reports for SIFs and indicates which SIFs can be more reliably labeled than others. Further research is needed, to fine tune the LLM with labelled radiology reports to improve agreement with human experts, when labelling for SIFs.},
+        The ERS/ESTS/ESTRO/ESR/ESTI/EFOMP statement on management of incidental findings from low dose CT screening for lung cancer recognizes nine specific significant incidental findings (SIFs). To detect these SIFs efficiently and accurately during lung cancer screening, artificial intelligence algorithms may aid human experts. Automatic identification of scans with SIFs would be required to effectively train AI systems to detect these SIFs. In this study, we investigate the agreement between out-of-box Llama2-7b-chat, privacy-preserving, large language model (LLM) and human experts in labeling SIFs in thorax-abdomen radiology reports.
+        Methods & Materials:
+        In this study, 100 CT radiology reports from Radboud University Medical Center were examined for nine specific SIFs. The LLM generated outputs for the presence (1) or absence (0) of the SIFs, using an engineered system prompt. The agreement between LLM outputs and two human experts, the agreement between five runs of the LLM, and the agreement between the two human experts were assessed using Fleiss' kappa (k), with bootstrapped 95% confidence intervals.
+        Results:
+        The interobserver agreement for the nine specific SIFs for the two human experts was substantial, median k = 0.763 (0.479, 0.891). The agreement between the LLM and the first human expert was moderate, median k = 0.427 (0.181, 0.661) and between the LLM and the second human expert was fair, median k = 0.395 (0.101, 0.619), for the nine SIFs. The agreement between the five runs of the LLM was almost perfect k = 0.970 (0.912, 1.000). When analyzing agreement in specific SIFs, we found substantial agreement between the LLM and each observer for bronchiectasis (k = 0.667 (0.327, 0.884) and k = 0.634 (0.293, 0.878)), and coronary artery calcifications (k = 0.627 (0.433, 0.792) and k = 0.610 (0.381, 0.793)), and poor agreement for thyroid abnormalities (k = -0.027 (-0.048, -0.005), k = -0.020 (-0.042, -0.005)). The agreement between the human experts was substantial for bronchiectasis k = 0.889 (0.693, 1.000), and for thyroid abnormalities k = 0.884 (0.479, 1.000), and poor for interstitial lung abnormalities k = 0.321 (-0.053, 0.628), and for mediastinal masses k = 0.313 (-0.036, 0.795).
+        Conclusion:
+        This study demonstrates that there is a large difference between the agreement of each human observer and the LLM, compared to the agreement between the two human observers. This study highlights the potential for LLMs to automatically label radiology reports for SIFs and indicates which SIFs can be more reliably labeled than others. Further research is needed to fine-tune the LLM with labelled radiology reports to improve agreement with human experts when labelling for SIFs.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -11086,10 +11110,10 @@ @mastersthesis{Hack21
   author = {Roel HACKING},
   title = {Combining CT scans and clinical features for improved automated COVID-19 detection},
   abstract = {During the first peak of the COVID-19 pandemic, hospitals in hard-hit regions were overflowing with patients at the emergency unit with respiratory complaints. Since the RT-PCR test was in limited supply at the time and test results took a long time to obtain, many hospitals opted to use chest CT scans of COVID-19 suspects.
-                              As a result of this, several studies examined the possibility of automating the detection of COVID-19 in CT scans. One such study, by Lessmann et al., 2020, developed a model to predict COVID-19 severity scores based on these chest CT scans. In this thesis, we extended their model in several ways to take into account additional clinical values (such as blood values, sex, and age) to predict either PCR outcomes or clinical diagnoses.
-                              Based on data from the Canisius-Wilhelmina Ziekenhuis (CWZ) hospital and Radboudumc hospitals, as well as the COVID-19 dataset by Ning et al., 2020, we found that integrating these two modalities can indeed lead to improved performance when both clinical and visual features are of sufficient quality. When training on data from the CWZ hospital and evaluating on data from the Radboudumc hospital, models using only clinical features or visual features achieved Area Under the ROC Curve (AUC) values of 0.773 and 0.826, respectively; their combination resulted in an AUC of 0.851.
-                              Similarly, when training on data from the Union hospital in the iCTCF dataset and predicting on data from the Union hospital in that same dataset, we obtained AUCs of 0.687 and 0.812 for clinical and visual features, respectively; their combination resulted in an AUC of 0.862.
-                              However, we also discovered that the patterns of missing data present in these clinical feature datasets can play an essential role in the performance of the models fitted on them. We thus developed additional methods to analyze and mitigate this effect to obtain fairer evaluations and increase model generalizability. Still, the high diagnostic performance of some of our models suggests that they could be adapted into clinical practice, and our methods pertaining to missing data could be used to aid further research using clinical feature datasets.},
+                                As a result of this, several studies examined the possibility of automating the detection of COVID-19 in CT scans. One such study, by Lessmann et al., 2020, developed a model to predict COVID-19 severity scores based on these chest CT scans. In this thesis, we extended their model in several ways to take into account additional clinical values (such as blood values, sex, and age) to predict either PCR outcomes or clinical diagnoses.
+                                Based on data from the Canisius-Wilhelmina Ziekenhuis (CWZ) hospital and Radboudumc hospitals, as well as the COVID-19 dataset by Ning et al., 2020, we found that integrating these two modalities can indeed lead to improved performance when both clinical and visual features are of sufficient quality. When training on data from the CWZ hospital and evaluating on data from the Radboudumc hospital, models using only clinical features or visual features achieved Area Under the ROC Curve (AUC) values of 0.773 and 0.826, respectively; their combination resulted in an AUC of 0.851.
+                                Similarly, when training on data from the Union hospital in the iCTCF dataset and predicting on data from the Union hospital in that same dataset, we obtained AUCs of 0.687 and 0.812 for clinical and visual features, respectively; their combination resulted in an AUC of 0.862.
+                                However, we also discovered that the patterns of missing data present in these clinical feature datasets can play an essential role in the performance of the models fitted on them. We thus developed additional methods to analyze and mitigate this effect to obtain fairer evaluations and increase model generalizability. Still, the high diagnostic performance of some of our models suggests that they could be adapted into clinical practice, and our methods pertaining to missing data could be used to aid further research using clinical feature datasets.},
   file = {Hack21.pdf:pdf/Hack21.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University},
@@ -11103,14 +11127,14 @@ @conference{Hadd19
   booktitle = {EACR},
   year = {2019},
   abstract = {Background and Objective:
-                            A three-dimensional visualization of a human carcinoma could provide invaluable diagnostic information and redefine how we perceive and analyze cancer invasion. As deep learning begins automating the diagnostic workflow and cutting-edge microcopy provides unprecedented ways of visualizing tissue, combining these methologies could provide novel insight into malignant tumors and other pathologic entities. By combining Knife-Edge Scanning Microscopy with convolutional neural networks, we set out to visualize an entire threedimensional colorectal carcinoma segmented into specific tissue classifications.
-
-                          Methods:
-                          A Knife-Edge Scanning Microscope (KESM), developed by Strateos (San Francisco, CA, USA), was used to digitize a whole-mount, H&E stained, formalinfixed paraffin-embedded human tissue specimen obtained from the Radboudumc (Nijmegen, Netherlands). Sparse manual annotations of 5 tissue types (tumor, stroma, muscle, healthy glands, background) were provided using KESM data to train a convolutional neural network developed by the Computational Pathology Group (Radboudumc) for semantic segmentation of the colorectal carcinoma tissue. The three-dimensional visualization was generated using 3Scan's proprietary visualization pipeline.
-
-                          Results: The convolutional neural network was used to process roughly 1200 slices of KESM data. The stitched and rendered segmentation maps demonstrate the formalin-fixed paraffin-embedded carcinoma of roughly 5 millimeters in depth. As shown in the figure, the tumor invasive margin can be seen advancing into the surrounding tumor stroma.
-
-                          Conclusion: Based on our findings, we were capable of training a segmentation model on the 3D KESM data to create an accurate representation of a formalin-fixed paraffin-embedded colorectal carcinoma tissue block segmented into five tissue classifications. Going forward, this can have much broader implications on the research and understanding of invasive tumors.},
+                              A three-dimensional visualization of a human carcinoma could provide invaluable diagnostic information and redefine how we perceive and analyze cancer invasion. As deep learning begins automating the diagnostic workflow and cutting-edge microscopy provides unprecedented ways of visualizing tissue, combining these methodologies could provide novel insight into malignant tumors and other pathologic entities. By combining Knife-Edge Scanning Microscopy with convolutional neural networks, we set out to visualize an entire three-dimensional colorectal carcinoma segmented into specific tissue classifications.
+  
+                            Methods:
+                            A Knife-Edge Scanning Microscope (KESM), developed by Strateos (San Francisco, CA, USA), was used to digitize a whole-mount, H&E stained, formalin-fixed paraffin-embedded human tissue specimen obtained from the Radboudumc (Nijmegen, Netherlands). Sparse manual annotations of 5 tissue types (tumor, stroma, muscle, healthy glands, background) were provided using KESM data to train a convolutional neural network developed by the Computational Pathology Group (Radboudumc) for semantic segmentation of the colorectal carcinoma tissue. The three-dimensional visualization was generated using 3Scan's proprietary visualization pipeline.
+  
+                            Results: The convolutional neural network was used to process roughly 1200 slices of KESM data. The stitched and rendered segmentation maps demonstrate the formalin-fixed paraffin-embedded carcinoma of roughly 5 millimeters in depth. As shown in the figure, the tumor invasive margin can be seen advancing into the surrounding tumor stroma.
+  
+                            Conclusion: Based on our findings, we were capable of training a segmentation model on the 3D KESM data to create an accurate representation of a formalin-fixed paraffin-embedded colorectal carcinoma tissue block segmented into five tissue classifications. Going forward, this can have much broader implications on the research and understanding of invasive tumors.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -11163,10 +11187,10 @@ @article{Hadj22
   year = {2022},
   doi = {https://doi.org/10.1002/mp.16188},
   abstract = {Rapid advances in artificial intelligence (AI) and machine learning, and specifically in deep learning (DL) techniques, have enabled broad application of these methods in health care. The promise of the DL approach has spurred further interest in computer-aided diagnosis (CAD) development and applications using both "traditional" machine learning methods and newer DL-based methods. We use the term CAD-AI to refer to this expanded clinical decision support environment that uses traditional and DL-based AI methods.
-
-                          Numerous studies have been published to date on the development of machine learning tools for computer-aided, or AI-assisted, clinical tasks. However, most of these machine learning models are not ready for clinical deployment. It is of paramount importance to ensure that a clinical decision support tool undergoes proper training and rigorous validation of its generalizability and robustness before adoption for patient care in the clinic.
-
-                          To address these important issues, the American Association of Physicists in Medicine (AAPM) Computer-Aided Image Analysis Subcommittee (CADSC) is charged, in part, to develop recommendations on practices and standards for the development and performance assessment of computer-aided decision support systems. The committee has previously published two opinion papers on the evaluation of CAD systems and issues associated with user training and quality assurance of these systems in the clinic. With machine learning techniques continuing to evolve and CAD applications expanding to new stages of the patient care process, the current task group report considers the broader issues common to the development of most, if not all, CAD-AI applications and their translation from the bench to the clinic. The goal is to bring attention to the proper training and validation of machine learning algorithms that may improve their generalizability and reliability and accelerate the adoption of CAD-AI systems for clinical decision support.},
+  
+                            Numerous studies have been published to date on the development of machine learning tools for computer-aided, or AI-assisted, clinical tasks. However, most of these machine learning models are not ready for clinical deployment. It is of paramount importance to ensure that a clinical decision support tool undergoes proper training and rigorous validation of its generalizability and robustness before adoption for patient care in the clinic.
+  
+                            To address these important issues, the American Association of Physicists in Medicine (AAPM) Computer-Aided Image Analysis Subcommittee (CADSC) is charged, in part, to develop recommendations on practices and standards for the development and performance assessment of computer-aided decision support systems. The committee has previously published two opinion papers on the evaluation of CAD systems and issues associated with user training and quality assurance of these systems in the clinic. With machine learning techniques continuing to evolve and CAD applications expanding to new stages of the patient care process, the current task group report considers the broader issues common to the development of most, if not all, CAD-AI applications and their translation from the bench to the clinic. The goal is to bring attention to the proper training and validation of machine learning algorithms that may improve their generalizability and reliability and accelerate the adoption of CAD-AI systems for clinical decision support.},
   file = {Hadj22.pdf:pdf\\Hadj22.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   ss_id = {df2cedb6640c9c7e0627fb03cf26b49e82a154b0},
@@ -11407,21 +11431,21 @@ @article{Harl21
   url = {http://dx.doi.org/10.1093/rheumatology/keab835},
   volume = {61},
   abstract = {Abstract
-
-                                           Objectives
-                                           Earlier retrospective studies have suggested a relation between DISH and cardiovascular disease, including myocardial infarction. The present study assessed the association between DISH and incidence of cardiovascular events and mortality in patients with high cardiovascular risk.
-
-
-                                           Methods
-                                           In this prospective cohort study, we included 4624 patients (mean age 58.4 years, 69.6% male) from the Second Manifestations of ARTerial disease cohort. The main end point was major cardiovascular events (MACE: stroke, myocardial infarction and vascular death). Secondary endpoints included all-cause mortality and separate vascular events. Cause-specific proportional hazard models were used to evaluate the risk of DISH on all outcomes, and subdistribution hazard models were used to evaluate the effect of DISH on the cumulative incidence. All models were adjusted for age, sex, body mass index, blood pressure, diabetes, non-HDL cholesterol, packyears, renal function and C-reactive protein.
-
-
-                                           Results
-                                           DISH was present in 435 (9.4%) patients. After a median follow-up of 8.7 (IQR 5.0-12.0) years, 864 patients had died and 728 patients developed a MACE event. DISH was associated with an increased cumulative incidence of ischaemic stroke. After adjustment in cause-specific modelling, DISH remained significantly associated with ischaemic stroke (HR 1.55; 95% CI: 1.01, 2.38), but not with MACE (HR 0.99; 95% CI: 0.79, 1.24), myocardial infarction (HR 0.88; 95% CI: 0.59, 1.31), vascular death (HR 0.94; 95% CI: 0.68, 1.27) or all-cause mortality (HR 0.94; 95% CI: 0.77, 1.16).
-
-
-                                           Conclusion
-                                           The presence of DISH is independently associated with an increased incidence and risk for ischaemic stroke, but not with MACE, myocardial infarction, vascular death or all-cause mortality.},
+  
+                                             Objectives
+                                             Earlier retrospective studies have suggested a relation between DISH and cardiovascular disease, including myocardial infarction. The present study assessed the association between DISH and incidence of cardiovascular events and mortality in patients with high cardiovascular risk.
+  
+  
+                                             Methods
+                                             In this prospective cohort study, we included 4624 patients (mean age 58.4 years, 69.6% male) from the Second Manifestations of ARTerial disease cohort. The main end point was major cardiovascular events (MACE: stroke, myocardial infarction and vascular death). Secondary endpoints included all-cause mortality and separate vascular events. Cause-specific proportional hazard models were used to evaluate the risk of DISH on all outcomes, and subdistribution hazard models were used to evaluate the effect of DISH on the cumulative incidence. All models were adjusted for age, sex, body mass index, blood pressure, diabetes, non-HDL cholesterol, packyears, renal function and C-reactive protein.
+  
+  
+                                             Results
+                                             DISH was present in 435 (9.4%) patients. After a median follow-up of 8.7 (IQR 5.0-12.0) years, 864 patients had died and 728 patients developed a MACE event. DISH was associated with an increased cumulative incidence of ischaemic stroke. After adjustment in cause-specific modelling, DISH remained significantly associated with ischaemic stroke (HR 1.55; 95% CI: 1.01, 2.38), but not with MACE (HR 0.99; 95% CI: 0.79, 1.24), myocardial infarction (HR 0.88; 95% CI: 0.59, 1.31), vascular death (HR 0.94; 95% CI: 0.68, 1.27) or all-cause mortality (HR 0.94; 95% CI: 0.77, 1.16).
+  
+  
+                                             Conclusion
+                                             The presence of DISH is independently associated with an increased incidence and risk for ischaemic stroke, but not with MACE, myocardial infarction, vascular death or all-cause mortality.},
   all_ss_ids = {[0ebe8ab65571514718283cd2d8ac7277db3513c5]},
   automatic = {yes},
   citation-count = {5},
@@ -11462,9 +11486,9 @@ @article{Harl22
   pages = {rkac060},
   volume = {6},
   abstract = {Objectives: DISH has been associated with increased coronary artery calcifications and incident ischaemic stroke. The formation of bone along the spine may share pathways with calcium deposition in the aorta. We hypothesized that patients with DISH have increased vascular calcifications. Therefore we aimed to investigate the presence and extent of DISH in relation to thoracic aortic calcification (TAC) severity.
-                             Methods: This cross-sectional study included 4703 patients from the Second Manifestation of ARTerial disease cohort, consisting of patients with cardiovascular events or risk factors for cardiovascular disease. Chest radiographs were scored for DISH using the Resnick criteria. Different severities of TAC were scored arbitrarily from no TAC to mild, moderate or severe TAC. Using multivariate logistic regression, the associations between DISH and TAC were analysed with adjustments for age, sex, BMI, diabetes, smoking status, non-high-density lipoprotein cholesterol, cholesterol lowering drug usage, renal function and blood pressure.
-                             Results: A total of 442 patients (9.4\%) had evidence of DISH and 1789 (38\%) patients had TAC. The prevalence of DISH increased from 6.6\% in the no TAC group to 10.8\% in the mild, 14.3\% in the moderate and 17.1\% in the severe TAC group. After adjustments, DISH was significantly associated with the presence of TAC [odds ratio (OR) 1.46 [95\% CI 1.17, 1.82)]. In multinomial analyses, DISH was associated with moderate TAC [OR 1.43 (95\% CI 1.06, 1.93)] and severe TAC [OR 1.67 (95\% CI 1.19, 2.36)].
-                             Conclusions: Subjects with DISH have increased TACs, providing further evidence that patients with DISH have an increased burden of vascular calcifications.},
+                               Methods: This cross-sectional study included 4703 patients from the Second Manifestation of ARTerial disease cohort, consisting of patients with cardiovascular events or risk factors for cardiovascular disease. Chest radiographs were scored for DISH using the Resnick criteria. Different severities of TAC were scored arbitrarily from no TAC to mild, moderate or severe TAC. Using multivariate logistic regression, the associations between DISH and TAC were analysed with adjustments for age, sex, BMI, diabetes, smoking status, non-high-density lipoprotein cholesterol, cholesterol lowering drug usage, renal function and blood pressure.
+                               Results: A total of 442 patients (9.4\%) had evidence of DISH and 1789 (38\%) patients had TAC. The prevalence of DISH increased from 6.6\% in the no TAC group to 10.8\% in the mild, 14.3\% in the moderate and 17.1\% in the severe TAC group. After adjustments, DISH was significantly associated with the presence of TAC [odds ratio (OR) 1.46 (95\% CI 1.17, 1.82)]. In multinomial analyses, DISH was associated with moderate TAC [OR 1.43 (95\% CI 1.06, 1.93)] and severe TAC [OR 1.67 (95\% CI 1.19, 2.36)].
+                               Conclusions: Subjects with DISH have increased TACs, providing further evidence that patients with DISH have an increased burden of vascular calcifications.},
   file = {PubMed entry:http\://www.ncbi.nlm.nih.gov/pubmed/35993014:text/html},
   pmid = {35993014},
   ss_id = {f8fb57d8601ac0189ea80c6412232ac6771e51f5},
@@ -11549,7 +11573,7 @@ @article{Heba20
   url = {http://dx.doi.org/10.1055/s-0040-1713119},
   volume = {24},
   abstract = {No official data exist on the status of musculoskeletal (MSK) radiology in Europe. The Committee for National Societies conducted an international survey to understand the status of training, subspecialization, and local practice among the European Society of Musculoskeletal Radiology (ESSR) partner societies. This article reports the results of that survey. An online questionnaire was distributed to all 26 European national associations that act as official partner societies of the ESSR. The 24 questions were subdivided into six sections: society structure, relationship with the national radiological society, subspecialization, present radiology practice, MSK interventional procedures, and MSK ultrasound. The findings of our study show a lack of standardized training and/or accreditation methods in the field of MSK radiology at a national level. The European diploma in musculoskeletal radiology is directed to partly overcome this problem; however, this certification is still underrecognized. Using certification methods, a more homogeneous European landscape could be created in the future with a view to subspecialist training. MSK ultrasound and MSK interventional procedures should be performed by a health professional with a solid knowledge of the relevant imaging modalities and sufficient training in MSK radiology. Recognition of MSK radiology as an official subspecialty would make the field more attractive for younger colleagues as well as attracting the brightest and best, an important key to further development of both clinical and academic radiology.
-                                     Key Points},
+                                       Key Points},
   all_ss_ids = {[3f323518a65798690c99227bd7984cd50a8cdbc1]},
   automatic = {yes},
   citation-count = {9},
@@ -11673,12 +11697,12 @@ @article{Hend23
   doi = {https://doi.org/10.1007/s00330-022-09205-4},
   url = {https://link.springer.com/article/10.1007/s00330-022-09205-4},
   abstract = {Objectives: To assess how an artificial intelligence (AI) algorithm performs against five experienced musculoskeletal radiologists in diagnosing scaphoid fractures and whether it aids their diagnosis on conventional multi-view radiographs.
-
-                            Methods: Four datasets of conventional hand, wrist, and scaphoid radiographs were retrospectively acquired at two hospitals (hospitals A and B). Dataset 1 (12,990 radiographs from 3353 patients, hospital A) and dataset 2 (1117 radiographs from 394 patients, hospital B) were used for training and testing a scaphoid localization and laterality classification component. Dataset 3 (4316 radiographs from 840 patients, hospital A) and dataset 4 (688 radiographs from 209 patients, hospital B) were used for training and testing the fracture detector. The algorithm was compared with the radiologists in an observer study. Evaluation metrics included sensitivity, specificity, positive predictive value (PPV), area under the characteristic operating curve (AUC), Cohen's kappa coefficient (k), fracture localization precision, and reading time.
-
-                            Results: The algorithm detected scaphoid fractures with a sensitivity of 72%, specificity of 93%, PPV of 81%, and AUC of 0.88. The AUC of the algorithm did not differ from each radiologist (0.87 [radiologists' mean], p >=.05). AI assistance improved five out of ten pairs of inter-observer Cohen's k agreements (p <.05) and reduced reading time in four radiologists (p <.001), but did not improve other metrics in the majority of radiologists (p >=.05).
-
-                            Conclusions: The AI algorithm detects scaphoid fractures on conventional multi-view radiographs at the level of five experienced musculoskeletal radiologists and could significantly shorten their reading time.},
+  
+                              Methods: Four datasets of conventional hand, wrist, and scaphoid radiographs were retrospectively acquired at two hospitals (hospitals A and B). Dataset 1 (12,990 radiographs from 3353 patients, hospital A) and dataset 2 (1117 radiographs from 394 patients, hospital B) were used for training and testing a scaphoid localization and laterality classification component. Dataset 3 (4316 radiographs from 840 patients, hospital A) and dataset 4 (688 radiographs from 209 patients, hospital B) were used for training and testing the fracture detector. The algorithm was compared with the radiologists in an observer study. Evaluation metrics included sensitivity, specificity, positive predictive value (PPV), area under the characteristic operating curve (AUC), Cohen's kappa coefficient (k), fracture localization precision, and reading time.
+  
+                              Results: The algorithm detected scaphoid fractures with a sensitivity of 72%, specificity of 93%, PPV of 81%, and AUC of 0.88. The AUC of the algorithm did not differ from each radiologist (0.87 [radiologists' mean], p >=.05). AI assistance improved five out of ten pairs of inter-observer Cohen's k agreements (p <.05) and reduced reading time in four radiologists (p <.001), but did not improve other metrics in the majority of radiologists (p >=.05).
+  
+                              Conclusions: The AI algorithm detects scaphoid fractures on conventional multi-view radiographs at the level of five experienced musculoskeletal radiologists and could significantly shorten their reading time.},
   file = {Hend23.pdf:pdf\\Hend23.pdf:PDF},
   journal = ER,
   volume = {33},
@@ -11698,19 +11722,19 @@ @article{Hend23a
   url = {https://doi.org/10.1007/s00330-023-09826-3},
   volume = {33},
   abstract = {Objective
-                          To study trends in the incidence of reported pulmonary nodules and stage I lung cancer in chest CT.
-
-                          Methods
-                          We analyzed the trends in the incidence of detected pulmonary nodules and stage I lung cancer in chest CT scans in the period between 2008 and 2019. Imaging metadata and radiology reports from all chest CT studies were collected from two large Dutch hospitals. A natural language processing algorithm was developed to identify studies with any reported pulmonary nodule.
-
-                          Results
-                          Between 2008 and 2019, a total of 74,803 patients underwent 166,688 chest CT examinations at both hospitals combined. During this period, the annual number of chest CT scans increased from 9955 scans in 6845 patients in 2008 to 20,476 scans in 13,286 patients in 2019. The proportion of patients in whom nodules (old or new) were reported increased from 38% (2595/6845) in 2008 to 50% (6654/13,286) in 2019. The proportion of patients in whom significant new nodules (>= 5 mm) were reported increased from 9% (608/6954) in 2010 to 17% (1660/9883) in 2017. The number of patients with new nodules and corresponding stage I lung cancer diagnosis tripled and their proportion doubled, from 0.4% (26/6954) in 2010 to 0.8% (78/9883) in 2017.
-
-                          Conclusion
-                          The identification of incidental pulmonary nodules in chest CT has steadily increased over the past decade and has been accompanied by more stage I lung cancer diagnoses.
-
-                          Clinical relevance statement
-                          These findings stress the importance of identifying and efficiently managing incidental pulmonary nodules in routine clinical practice.},
+                            To study trends in the incidence of reported pulmonary nodules and stage I lung cancer in chest CT.
+  
+                            Methods
+                            We analyzed the trends in the incidence of detected pulmonary nodules and stage I lung cancer in chest CT scans in the period between 2008 and 2019. Imaging metadata and radiology reports from all chest CT studies were collected from two large Dutch hospitals. A natural language processing algorithm was developed to identify studies with any reported pulmonary nodule.
+  
+                            Results
+                            Between 2008 and 2019, a total of 74,803 patients underwent 166,688 chest CT examinations at both hospitals combined. During this period, the annual number of chest CT scans increased from 9955 scans in 6845 patients in 2008 to 20,476 scans in 13,286 patients in 2019. The proportion of patients in whom nodules (old or new) were reported increased from 38% (2595/6845) in 2008 to 50% (6654/13,286) in 2019. The proportion of patients in whom significant new nodules (>= 5 mm) were reported increased from 9% (608/6954) in 2010 to 17% (1660/9883) in 2017. The number of patients with new nodules and corresponding stage I lung cancer diagnosis tripled and their proportion doubled, from 0.4% (26/6954) in 2010 to 0.8% (78/9883) in 2017.
+  
+                            Conclusion
+                            The identification of incidental pulmonary nodules in chest CT has steadily increased over the past decade and has been accompanied by more stage I lung cancer diagnoses.
+  
+                            Clinical relevance statement
+                            These findings stress the importance of identifying and efficiently managing incidental pulmonary nodules in routine clinical practice.},
   all_ss_ids = {['9a589d8cc38a4770cf3d5819fc363a814902bb42']},
   file = {Hend23a.pdf:pdf\\Hend23a.pdf:PDF},
   gscites = {8},
@@ -11730,17 +11754,17 @@ @article{Hend23b
   number = {1},
   algorithm = {https://grand-challenge.org/algorithms/lung-nodule-detector-for-ct/},
   abstract = {Abstract
-                                        Background
-                                        Outside a screening program, early-stage lung cancer is generally diagnosed after the detection of incidental nodules in clinically ordered chest CT scans. Despite the advances in artificial intelligence (AI) systems for lung cancer detection, clinical validation of these systems is lacking in a non-screening setting.
-
-                                        Method
-                                        We developed a deep learning-based AI system and assessed its performance for the detection of actionable benign nodules (requiring follow-up), small lung cancers, and pulmonary metastases in CT scans acquired in two Dutch hospitals (internal and external validation). A panel of five thoracic radiologists labeled all nodules, and two additional radiologists verified the nodule malignancy status and searched for any missed cancers using data from the national Netherlands Cancer Registry. The detection performance was evaluated by measuring the sensitivity at predefined false positive rates on a free receiver operating characteristic curve and was compared with the panel of radiologists.
-
-                                        Results
-                                        On the external test set (100 scans from 100 patients), the sensitivity of the AI system for detecting benign nodules, primary lung cancers, and metastases is respectively 94.3% (82/87, 95% CI: 88.1-98.8%), 96.9% (31/32, 95% CI: 91.7-100%), and 92.0% (104/113, 95% CI: 88.5-95.5%) at a clinically acceptable operating point of 1 false positive per scan (FP/s). These sensitivities are comparable to or higher than the radiologists, albeit with a slightly higher FP/s (average difference of 0.6).
-
-                                        Conclusions
-                                        The AI system reliably detects benign and malignant pulmonary nodules in clinically indicated CT scans and can potentially assist radiologists in this setting.},
+                                          Background
+                                          Outside a screening program, early-stage lung cancer is generally diagnosed after the detection of incidental nodules in clinically ordered chest CT scans. Despite the advances in artificial intelligence (AI) systems for lung cancer detection, clinical validation of these systems is lacking in a non-screening setting.
+  
+                                          Method
+                                          We developed a deep learning-based AI system and assessed its performance for the detection of actionable benign nodules (requiring follow-up), small lung cancers, and pulmonary metastases in CT scans acquired in two Dutch hospitals (internal and external validation). A panel of five thoracic radiologists labeled all nodules, and two additional radiologists verified the nodule malignancy status and searched for any missed cancers using data from the national Netherlands Cancer Registry. The detection performance was evaluated by measuring the sensitivity at predefined false positive rates on a free receiver operating characteristic curve and was compared with the panel of radiologists.
+  
+                                          Results
+                                          On the external test set (100 scans from 100 patients), the sensitivity of the AI system for detecting benign nodules, primary lung cancers, and metastases is respectively 94.3% (82/87, 95% CI: 88.1-98.8%), 96.9% (31/32, 95% CI: 91.7-100%), and 92.0% (104/113, 95% CI: 88.5-95.5%) at a clinically acceptable operating point of 1 false positive per scan (FP/s). These sensitivities are comparable to or higher than the radiologists, albeit with a slightly higher FP/s (average difference of 0.6).
+  
+                                          Conclusions
+                                          The AI system reliably detects benign and malignant pulmonary nodules in clinically indicated CT scans and can potentially assist radiologists in this setting.},
   citation-count = {0},
   file = {Hend23b.pdf:pdf\Hend23b.pdf:PDF},
   journal = {Communications Medicine},
@@ -11758,20 +11782,20 @@ @article{Hend24
   doi = {10.1007/s00330-024-10744-1},
   url = {http://dx.doi.org/10.1007/s00330-024-10744-1},
   abstract = {Abstract
-                       Objectives
-                       To develop and validate an artificial intelligence (AI) system for measuring and detecting signs of carpal instability on conventional radiographs.
-
-                       Materials and methods
-                       Two case-control datasets of hand and wrist radiographs were retrospectively acquired at three hospitals (hospitals A, B, and C). Dataset 1 (2178 radiographs from 1993 patients, hospitals A and B, 2018-2019) was used for developing an AI system for measuring scapholunate (SL) joint distances, SL and capitolunate (CL) angles, and carpal arc interruptions. Dataset 2 (481 radiographs from 217 patients, hospital C, 2017-2021) was used for testing, and with a subsample (174 radiographs from 87 patients), an observer study was conducted to compare its performance to five clinicians. Evaluation metrics included mean absolute error (MAE), sensitivity, and specificity.
-
-                       Results
-                       Dataset 2 included 258 SL distances, 189 SL angles, 191 CL angles, and 217 carpal arc labels obtained from 217 patients (mean age, 51 years +- 23 [standard deviation]; 133 women). The MAE in measuring SL distances, SL angles, and CL angles was respectively 0.65 mm (95%CI: 0.59, 0.72), 7.9 degrees (95%CI: 7.0, 8.9), and 5.9 degrees (95%CI: 5.2, 6.6). The sensitivity and specificity for detecting arc interruptions were 83% (95%CI: 74, 91) and 64% (95%CI: 56, 71). The measurements were largely comparable to those of the clinicians, while arc interruption detections were more accurate than those of most clinicians.
-
-                       Conclusion
-                       This study demonstrates that a newly developed automated AI system accurately measures and detects signs of carpal instability on conventional radiographs.
-
-                       Clinical relevance statement
-                       This system has the potential to improve detections of carpal arc interruptions and could be a promising tool for supporting clinicians in detecting carpal instability.},
+                         Objectives
+                         To develop and validate an artificial intelligence (AI) system for measuring and detecting signs of carpal instability on conventional radiographs.
+  
+                         Materials and methods
+                         Two case-control datasets of hand and wrist radiographs were retrospectively acquired at three hospitals (hospitals A, B, and C). Dataset 1 (2178 radiographs from 1993 patients, hospitals A and B, 2018-2019) was used for developing an AI system for measuring scapholunate (SL) joint distances, SL and capitolunate (CL) angles, and carpal arc interruptions. Dataset 2 (481 radiographs from 217 patients, hospital C, 2017-2021) was used for testing, and with a subsample (174 radiographs from 87 patients), an observer study was conducted to compare its performance to five clinicians. Evaluation metrics included mean absolute error (MAE), sensitivity, and specificity.
+  
+                         Results
+                         Dataset 2 included 258 SL distances, 189 SL angles, 191 CL angles, and 217 carpal arc labels obtained from 217 patients (mean age, 51 years +- 23 [standard deviation]; 133 women). The MAE in measuring SL distances, SL angles, and CL angles was respectively 0.65 mm (95%CI: 0.59, 0.72), 7.9 degrees (95%CI: 7.0, 8.9), and 5.9 degrees (95%CI: 5.2, 6.6). The sensitivity and specificity for detecting arc interruptions were 83% (95%CI: 74, 91) and 64% (95%CI: 56, 71). The measurements were largely comparable to those of the clinicians, while arc interruption detections were more accurate than those of most clinicians.
+  
+                         Conclusion
+                         This study demonstrates that a newly developed automated AI system accurately measures and detects signs of carpal instability on conventional radiographs.
+  
+                         Clinical relevance statement
+                         This system has the potential to improve detections of carpal arc interruptions and could be a promising tool for supporting clinicians in detecting carpal instability.},
   all_ss_ids = {['3f8b7ed7aa269ba8d4c7f8b6e34f2e8e92f1c06e']},
   automatic = {yes},
   citation-count = {0},
@@ -11787,11 +11811,11 @@ @phdthesis{Hend24a
   title = {Artificial Intelligence for Computer Aided Diagnosis of Scaphoid Fractures and Associated Instability on Conventional Radiography},
   url = {https://research.tilburguniversity.edu/en/publications/artificial-intelligence-for-computer-aided-diagnosis-of-scaphoid-},
   abstract = {Injuries of the hand and wrist belong to the most common injuries in all age groups and impose a large burden on individuals and society. From all hand and wrist injuries, fractures in the scaphoid bone are among the most common and challenging injuries to diagnose due to their subtle presentation on initial radiographs. If not timely diagnosed, these fractures can lead to severe joint degeneration. Consequently, they are structurally overtreated out of precaution. These fractures lead to substantial healthcare costs and diminish the affected individual's quality of life. This thesis proposes the use of artificial intelligence (AI) to assist physicians in the diagnosis of scaphoid fractures and associated wrist joint (carpal) instability on initial hand and wrist radiographs.
-              In Chapter 2, we present the development and validation of an AI system that uses convolutional neural networks (CNNs) for the automated detection of scaphoid fractures in conventional frontal view radiographs. This system first identifies the contours of the scaphoid, isolates the region for analysis, and then provides a confidence score for the presence of a fracture. It also highlights image regions that were relevant to its decision. The system was trained with radiographs from the Radboud University Medical Center (Radboudumc) and was validated on radiographs from the Jeroen Bosch Hospital (JBZ). The system was found to detect scaphoid fractures just as well as 11 radiologists, which shows its potential to assist physicians in clinical practice.
-              In Chapter 3 (interlude), we reflect on the impact of the COVID-19 pandemic and describe a collaborative effort of the DIAG research group to develop an AI system for the automated triage of suspected COVID-19 patients. This system analyses a chest CT scan to assess the likelihood of COVID19 and the extent of pulmonary involvement. It was trained and validated with scans from Radboudumc and the Canisius-Wilhelmina Hospital. The assessments of the AI system were found to closely align with those of eight radiologists, which illustrates the capability of AI in addressing emergent global health crises that go beyond orthopaedic injuries.
-              In Chapter 4, we introduce a new AI system designed for the detection of scaphoid fractures using multiple radiographic views of the wrist. This CNN-based system gives a fracture score per anatomical region to localize potential fractures with even greater precision. We expanded and revised the dataset of radiographs from Radboudumc and then validated the system on radiographs from JBZ. We found that the system's performance was comparable to that of experienced musculoskeletal (MSK) radiologists, and that AI-assistance improved reading times and consistency of the assessments among the radiologists. The system did not significantly increase their diagnostic accuracy, which suggests it has most value as a diagnostic aid for less experienced practitioners.
-              In Chapter 5, we present an AI system for measuring and detecting signs of carpal instability on hand and wrist radiographs. By using CNNs and statistical models, this system provides measurements that are in line with clinical guidelines. These measurements included the scapholunate distance, scapholunate and capitolunate angle, and carpal arcs of Gilula. The system was developed with radiographs from both Radboudumc and JBZ and was validated on radiographs from Hospital Gelderse Vallei (ZGV). We found that the accuracy of the automated measurements was largely comparable to that of clinicians with different specialties, while it displayed better performance in identifying disruptions in carpal alignment. Therefore, the system may facilitate early detection and treatment of carpal instability, and potentially prevent long-term joint damage.
-              In the concluding Chapter 6, we synthesize the thesis's contributions and suggest future research directions. Currently, automated fracture detection in all carpal bones remains understudied and there is a lack of public datasets for the development and validation of wrist fracture detection systems. Furthermore, we underline the importance of integrating the investigated AI systems into a single computer aided detection system for clinical practice, which will greatly contribute to the valorization of the acquired research findings.},
+                In Chapter 2, we present the development and validation of an AI system that uses convolutional neural networks (CNNs) for the automated detection of scaphoid fractures in conventional frontal view radiographs. This system first identifies the contours of the scaphoid, isolates the region for analysis, and then provides a confidence score for the presence of a fracture. It also highlights image regions that were relevant to its decision. The system was trained with radiographs from the Radboud University Medical Center (Radboudumc) and was validated on radiographs from the Jeroen Bosch Hospital (JBZ). The system was found to detect scaphoid fractures just as well as 11 radiologists, which shows its potential to assist physicians in clinical practice.
+                In Chapter 3 (interlude), we reflect on the impact of the COVID-19 pandemic and describe a collaborative effort of the DIAG research group to develop an AI system for the automated triage of suspected COVID-19 patients. This system analyses a chest CT scan to assess the likelihood of COVID-19 and the extent of pulmonary involvement. It was trained and validated with scans from Radboudumc and the Canisius-Wilhelmina Hospital. The assessments of the AI system were found to closely align with those of eight radiologists, which illustrates the capability of AI in addressing emergent global health crises that go beyond orthopaedic injuries.
+                In Chapter 4, we introduce a new AI system designed for the detection of scaphoid fractures using multiple radiographic views of the wrist. This CNN-based system gives a fracture score per anatomical region to localize potential fractures with even greater precision. We expanded and revised the dataset of radiographs from Radboudumc and then validated the system on radiographs from JBZ. We found that the system's performance was comparable to that of experienced musculoskeletal (MSK) radiologists, and that AI-assistance improved reading times and consistency of the assessments among the radiologists. The system did not significantly increase their diagnostic accuracy, which suggests it has most value as a diagnostic aid for less experienced practitioners.
+                In Chapter 5, we present an AI system for measuring and detecting signs of carpal instability on hand and wrist radiographs. By using CNNs and statistical models, this system provides measurements that are in line with clinical guidelines. These measurements included the scapholunate distance, scapholunate and capitolunate angle, and carpal arcs of Gilula. The system was developed with radiographs from both Radboudumc and JBZ and was validated on radiographs from Hospital Gelderse Vallei (ZGV). We found that the accuracy of the automated measurements was largely comparable to that of clinicians with different specialties, while it displayed better performance in identifying disruptions in carpal alignment. Therefore, the system may facilitate early detection and treatment of carpal instability, and potentially prevent long-term joint damage.
+                In the concluding Chapter 6, we synthesize the thesis's contributions and suggest future research directions. Currently, automated fracture detection in all carpal bones remains understudied and there is a lack of public datasets for the development and validation of wrist fracture detection systems. Furthermore, we underline the importance of integrating the investigated AI systems into a single computer aided detection system for clinical practice, which will greatly contribute to the valorization of the acquired research findings.},
   file = {Hend24a.pdf:pdf/Hend24a.pdf:PDF},
   optnote = {DIAG},
   school = {Tilburg University},
@@ -11801,22 +11825,6 @@ @phdthesis{Hend24a
   journal = {PhD thesis},
 }
 
-@inproceedings{Heri18,
-  author = {Hering, Alessa and Kuckertz, Sven and Heldmann, Stefan and Heinrich, Mattias},
-  booktitle = {Bildverarbeitung f\"ur die Medizin},
-  title = {Enhancing Label-Driven Deep Deformable Image Registration with Local Distance Metrics for State-of-the-Art Cardiac Motion Tracking},
-  pages = {309--314},
-  publisher = {Springer},
-  url = {https://arxiv.org/abs/1812.01859},
-  abstract = {While deep learning has achieved significant advances in accuracy for medical image segmentation, its benefits for deformable image registration have so far remained limited to reduced computation times. Previous work has either focused on replacing the iterative optimization of distance and smoothness terms with CNN-layers or using supervised approaches driven by labels. Our method is the first to combine the complementary strengths of global semantic information (represented by segmentation labels) and local distance metrics that help align surrounding structures. We demonstrate significant higher Dice scores (of 86.5%) for deformable cardiac image registration compared to classic registration (79.0%) as well as label-driven deep learning frameworks (83.4%).},
-  file = {Heri18.pdf:pdf\\Heri18.pdf:PDF},
-  optnote = {DIAG, RADIOLOGY},
-  year = {2019},
-  ss_id = {1216c70fa9e3eaa3ff5f7755e1d04147caed4818},
-  all_ss_ids = {['1216c70fa9e3eaa3ff5f7755e1d04147caed4818']},
-  gscites = {43},
-}
-
 @inproceedings{Heri19,
   author = {Hering, Alessa and van Ginneken, Bram and Heldmann, Stefan},
   title = {mlVIRNET: Multilevel Variational Image Registration Network},
@@ -11871,6 +11879,24 @@ @inproceedings{Heri19b
   gscites = {20},
 }
 
+@inproceedings{Heri19c,
+  author = {Hering, Alessa and Kuckertz, Sven and Heldmann, Stefan and Heinrich, Mattias P.},
+  title = {Enhancing Label-Driven Deep Deformable Image Registration with Local Distance Metrics for State-of-the-Art Cardiac Motion Tracking},
+  booktitle = {Bildverarbeitung f\"ur die Medizin},
+  series = {Informatik aktuell},
+  publisher = {Springer},
+  doi = {10.1007/978-3-658-25326-4_69},
+  year = {2019},
+  abstract = {While deep learning has achieved significant advances in accuracy for medical image segmentation, its benefits for deformable image registration have so far remained limited to reduced computation times. Previous work has either focused on replacing the iterative optimization of distance and smoothness terms with CNN-layers or using supervised approaches driven by labels. Our method is the first to combine the complementary strengths of global semantic information (represented by segmentation labels) and local distance metrics that help align surrounding structures. We demonstrate significant higher Dice scores (of 86.5\%) for deformable cardiac image registration compared to classic registration (79.0\%) as well as label-driven deep learning frameworks (83.4\%).},
+  url = {http://dx.doi.org/10.1007/978-3-658-25326-4_69},
+  file = {Heri19c.pdf:pdf\\Heri19c.pdf:PDF},
+  optnote = {DIAG, RADIOLOGY},
+  automatic = {yes},
+  all_ss_ids = {['1216c70fa9e3eaa3ff5f7755e1d04147caed4818']},
+  citation-count = {29},
+  pages = {309--314},
+}
+
 @inproceedings{Heri20,
   author = {Alessa Hering and Stefan Heldmann},
   booktitle = {Bildverarbeitung f\"ur die Medizin},
@@ -11956,7 +11980,7 @@ @article{Heri23
   url = {http://dx.doi.org/10.1109/TMI.2022.3213983},
   volume = {42},
   abstract = {Image registration is a fundamental medical image analysis task, and a wide variety of approaches have been proposed. However, only a few studies have comprehensively compared medical image registration approaches on a wide range of clinically relevant tasks, in part because of the   lack of availability of such diverse data. This limits the development of registration methods, the adoption of research advances into practice, and a fair benchmark across competing approaches. The Learn2Reg challenge addresses these limitations by providing a multi-task medical image registration benchmark for comprehensive characterisation of deformable registration algorithms. A continuous evaluation will be possible at \url{https://learn2reg.grand-challenge.org}.
-                                Learn2Reg covers a wide range of anatomies (brain, abdomen, and thorax), modalities (ultrasound, CT, MR), availability of annotations, as well as intra- and inter-patient registration evaluation. We established an easily accessible framework for training and validation of 3D registration methods, which enabled the compilation of results of over 65 individual method submissions from more than 20 unique teams. We used a complementary set of metrics, including robustness, accuracy, plausibility, and runtime, enabling unique insight into the current state-of-the-art of medical image registration. This paper describes datasets, tasks, evaluation methods and results of the challenge, and the results of further analysis of transferability to new datasets, the importance of label supervision, and resulting bias.},
+                                  Learn2Reg covers a wide range of anatomies (brain, abdomen, and thorax), modalities (ultrasound, CT, MR), availability of annotations, as well as intra- and inter-patient registration evaluation. We established an easily accessible framework for training and validation of 3D registration methods, which enabled the compilation of results of over 65 individual method submissions from more than 20 unique teams. We used a complementary set of metrics, including robustness, accuracy, plausibility, and runtime, enabling unique insight into the current state-of-the-art of medical image registration. This paper describes datasets, tasks, evaluation methods and results of the challenge, and the results of further analysis of transferability to new datasets, the importance of label supervision, and resulting bias.},
   all_ss_ids = {['2e09fa7387659a79f41d809ce40d32cc8c847bb7']},
   automatic = {yes},
   citation-count = {50},
@@ -11987,17 +12011,17 @@ @article{Heri24a
   doi = {10.1007/s11548-024-03181-4},
   url = {http://dx.doi.org/10.1007/s11548-024-03181-4},
   abstract = {Abstract
-                    Purpose
-                    AI-assisted techniques for lesion registration and segmentation have the potential to make CT-based tumor follow-up assessment faster and less reader-dependent. However, empirical evidence on the advantages of AI-assisted volumetric segmentation for lymph node and soft tissue metastases in follow-up CT scans is lacking. The aim of this study was to assess the efficiency, quality, and inter-reader variability of an AI-assisted workflow for volumetric segmentation of lymph node and soft tissue metastases in follow-up CT scans. Three hypotheses were tested: (H1) Assessment time for follow-up lesion segmentation is reduced using an AI-assisted workflow. (H2) The quality of the AI-assisted segmentation is non-inferior to the quality of fully manual segmentation. (H3) The inter-reader variability of the resulting segmentations is reduced with AI assistance.
-
-                    Materials and methods
-                    The study retrospectively analyzed 126 lymph nodes and 135 soft tissue metastases from 55 patients with stage IV melanoma. Three radiologists from two institutions performed both AI-assisted and manual segmentation, and the results were statistically analyzed and compared to a manual segmentation reference standard.
-
-                    Results
-                    AI-assisted segmentation reduced user interaction time significantly by 33% (222 s vs. 336 s), achieved similar Dice scores (0.80-0.84 vs. 0.81-0.82) and decreased inter-reader variability (median Dice 0.85-1.0 vs. 0.80-0.82; ICC 0.84 vs. 0.80), compared to manual segmentation.
-
-                    Conclusion
-                    The findings of this study support the use of AI-assisted registration and volumetric segmentation for lymph node and soft tissue metastases in follow-up CT scans. The AI-assisted workflow achieved significant time savings, similar segmentation quality, and reduced inter-reader variability compared to manual segmentation.},
+                      Purpose
+                      AI-assisted techniques for lesion registration and segmentation have the potential to make CT-based tumor follow-up assessment faster and less reader-dependent. However, empirical evidence on the advantages of AI-assisted volumetric segmentation for lymph node and soft tissue metastases in follow-up CT scans is lacking. The aim of this study was to assess the efficiency, quality, and inter-reader variability of an AI-assisted workflow for volumetric segmentation of lymph node and soft tissue metastases in follow-up CT scans. Three hypotheses were tested: (H1) Assessment time for follow-up lesion segmentation is reduced using an AI-assisted workflow. (H2) The quality of the AI-assisted segmentation is non-inferior to the quality of fully manual segmentation. (H3) The inter-reader variability of the resulting segmentations is reduced with AI assistance.
+  
+                      Materials and methods
+                      The study retrospectively analyzed 126 lymph nodes and 135 soft tissue metastases from 55 patients with stage IV melanoma. Three radiologists from two institutions performed both AI-assisted and manual segmentation, and the results were statistically analyzed and compared to a manual segmentation reference standard.
+  
+                      Results
+                      AI-assisted segmentation reduced user interaction time significantly by 33% (222 s vs. 336 s), achieved similar Dice scores (0.80-0.84 vs. 0.81-0.82) and decreased inter-reader variability (median Dice 0.85-1.0 vs. 0.80-0.82; ICC 0.84 vs. 0.80), compared to manual segmentation.
+  
+                      Conclusion
+                      The findings of this study support the use of AI-assisted registration and volumetric segmentation for lymph node and soft tissue metastases in follow-up CT scans. The AI-assisted workflow achieved significant time savings, similar segmentation quality, and reduced inter-reader variability compared to manual segmentation.},
   all_ss_ids = {['3a6a290dd40ff59f17702b045c4ebc7e40a72da6']},
   automatic = {yes},
   citation-count = {0},
@@ -12013,7 +12037,7 @@ @conference{Herm16
   booktitle = {DPA's Pathology Visions Conference 2016, San Diego, CA, US},
   year = {2016},
   abstract = {Research into cancer biomarkers often comprises testing of a number of potentially relevant markers in tissue sections. Use of single antibody immunohistochemistry, however, limits the amount of information available from the analysis. One single component can be visualized at a time, prohibiting the simultaneous assessment of multiple markers. Immunofluorescence is a widely used alternative, enabling two or three simultaneous markers. In, for instance, the study of tumor infiltrating lymphocytes one may wish to use multiplex immunophenotyping to identify the relevant subtypes of T-cells. This may require more than three markers in a single section. Also, because of availability, it is not always possible to compile panels of markers raised in different species to prevent cross-reactivity.
-                                                       This workshop will focus on highly multiplexed imaging, allowing up to 7 markers in a single section. The method comprises a staining procedure consisting of consecutive steps of IHC staining with tyramid signal amplification and microwave stripping of the antibody. Every antibody is labeled with a different fluorescent dye. Subsequent imaging is performed using a fully automated multispectral imaging setup. This approach enables the use of antibodies raised in the same species (e.g. mouse MAbs) and is capable of eliminating the effect of autofluorescence. The workshop will consist of two parts. First the specific staining techniques will be treated. In the second part we will focus on the imaging and analysis options. There will be ample opportunities for questions and discussion.},
+                                                         This workshop will focus on highly multiplexed imaging, allowing up to 7 markers in a single section. The method comprises a staining procedure consisting of consecutive steps of IHC staining with tyramide signal amplification and microwave stripping of the antibody. Every antibody is labeled with a different fluorescent dye. Subsequent imaging is performed using a fully automated multispectral imaging setup. This approach enables the use of antibodies raised in the same species (e.g. mouse MAbs) and is capable of eliminating the effect of autofluorescence. The workshop will consist of two parts. First the specific staining techniques will be treated. In the second part we will focus on the imaging and analysis options. There will be ample opportunities for questions and discussion.},
   file = {Herm16.pdf:pdf\\Herm16.pdf:PDF},
   optnote = {DIAG},
 }
@@ -12049,13 +12073,13 @@ @article{Herm19
   doi = {10.1681/ASN.2019020144},
   url = {https://jasn.asnjournals.org/content/30/10/1968},
   abstract = {Background: The development of deep neural networks is facilitating more advanced digital analysis of histopathologic images. We trained a convolutional neural network for multiclass segmentation of digitized kidney tissue sections stained with periodic acid-Schiff (PAS). Methods: We trained the network using multiclass annotations from 40 whole-slide images of stained
-                                                       kidney transplant biopsies and applied it to four independent data sets. We assessed multiclass segmentation performance by calculating Dice coefficients for ten tissue classes on ten transplant biopsies from the Radboud University Medical Center in Nijmegen, The Netherlands, and on ten transplant biopsies from an external center for validation. We also fully segmented 15 nephrectomy samples and calculated the
-                                                       network's glomerular detection rates and compared network-based measures with visually scored histologic components (Banff classification) in 82 kidney transplant biopsies.
-                                                       Results: The weighted mean Dice coefficients of all classes were 0.80 and 0.84 in ten kidney transplant biopsies from the Radboud center and the external center, respectively. The best segmented class was
-                                                       "glomeruli? in both data sets (Dice coefficients, 0.95 and 0.94, respectively), followed by "tubuli combined? and "interstitium.? The network detected 92.7% of all glomeruli in nephrectomy samples, with
-                                                       10.4% false positives. In whole transplant biopsies, the mean intraclass correlation coefficient for glomerular counting performed by pathologists versus the network was 0.94. We found significant correlations
-                                                       between visually scored histologic components and network-based measures. Conclusions: This study presents the first convolutional neural network formulticlass segmentation of PASstained
-                                                       nephrectomy samples and transplant biopsies. Our network may have utility for quantitative studies involving kidney histopathology across centers and provide opportunities for deep learning applications in routine diagnostics.},
+                                                         kidney transplant biopsies and applied it to four independent data sets. We assessed multiclass segmentation performance by calculating Dice coefficients for ten tissue classes on ten transplant biopsies from the Radboud University Medical Center in Nijmegen, The Netherlands, and on ten transplant biopsies from an external center for validation. We also fully segmented 15 nephrectomy samples and calculated the
+                                                         network's glomerular detection rates and compared network-based measures with visually scored histologic components (Banff classification) in 82 kidney transplant biopsies.
+                                                         Results: The weighted mean Dice coefficients of all classes were 0.80 and 0.84 in ten kidney transplant biopsies from the Radboud center and the external center, respectively. The best segmented class was
+                                                         "glomeruli" in both data sets (Dice coefficients, 0.95 and 0.94, respectively), followed by "tubuli combined" and "interstitium." The network detected 92.7% of all glomeruli in nephrectomy samples, with
+                                                         10.4% false positives. In whole transplant biopsies, the mean intraclass correlation coefficient for glomerular counting performed by pathologists versus the network was 0.94. We found significant correlations
+                                                         between visually scored histologic components and network-based measures. Conclusions: This study presents the first convolutional neural network for multiclass segmentation of PAS-stained
+                                                         nephrectomy samples and transplant biopsies. Our network may have utility for quantitative studies involving kidney histopathology across centers and provide opportunities for deep learning applications in routine diagnostics.},
   file = {Herm19.pdf:pdf\\Herm19.pdf:PDF},
   optnote = {DIAG},
   pmid = {31488607},
@@ -12117,8 +12141,8 @@ @article{Herm22
   pages = {1418-1432},
   doi = {https://doi.org/10.1016/j.ajpath.2022.06.009},
   abstract = {In kidney transplant biopsies, both inflammation and chronic changes are important features that predict long-term graft survival. Quantitative scoring of these features is important for transplant diagnostics and kidney research. However, visual scoring is poorly reproducible and labor-intensive. The goal of this study was to investigate the potential of convolutional neural networks (CNNs) to quantify inflammation and chronic features in kidney transplant biopsies.
-                             A structure segmentation CNN and a lymphocyte detection CNN were applied on 125 whole-slide image pairs of PAS-, and CD3-stained slides. The CNN results were used to quantify healthy and sclerotic glomeruli, interstitial fibrosis, tubular atrophy, and inflammation both within non-atrophic and atrophic tubuli, and in areas of interstitial fibrosis. The computed tissue features showed high correlations with Banff lesion scores of five pathologists. Analyses on a small subset showed a moderate correlation towards higher CD3+ cell density within scarred regions and higher CD3+ cell count inside atrophic tubuli correlated with long-term change of estimated glomerular filtration rate.
-                             The presented CNNs are valid tools to yield objective quantitative information on glomeruli number, fibrotic tissue, and inflammation within scarred and non-scarred kidney parenchyma in a reproducible fashion. CNNs have the potential to improve kidney transplant diagnostics and will benefit the community as a novel method to generate surrogate endpoints for large-scale clinical studies.},
+                               A structure segmentation CNN and a lymphocyte detection CNN were applied on 125 whole-slide image pairs of PAS-, and CD3-stained slides. The CNN results were used to quantify healthy and sclerotic glomeruli, interstitial fibrosis, tubular atrophy, and inflammation both within non-atrophic and atrophic tubuli, and in areas of interstitial fibrosis. The computed tissue features showed high correlations with Banff lesion scores of five pathologists. Analyses on a small subset showed a moderate correlation towards higher CD3+ cell density within scarred regions and higher CD3+ cell count inside atrophic tubuli correlated with long-term change of estimated glomerular filtration rate.
+                               The presented CNNs are valid tools to yield objective quantitative information on glomeruli number, fibrotic tissue, and inflammation within scarred and non-scarred kidney parenchyma in a reproducible fashion. CNNs have the potential to improve kidney transplant diagnostics and will benefit the community as a novel method to generate surrogate endpoints for large-scale clinical studies.},
   file = {Herm22.pdf:pdf\\Herm22.pdf:PDF},
   optnote = {DIAG},
   pmid = {35843265},
@@ -12155,10 +12179,10 @@ @article{Heuv16
   pages = {241 - 251},
   doi = {10.1016/j.nicl.2016.07.002},
   abstract = {In this paper a Computer Aided Detection (CAD) system is presented to automatically detect Cerebral Microbleeds (CMBs) in patients with Traumatic Brain Injury (TBI). It is believed that the presence of CMBs has clinical prognostic value in TBI patients. To study the contribution of CMBs in patient outcome, accurate detection of CMBs is required. Manual detection of CMBs in TBI patients is a time consuming task that is prone to errors, because CMBs are easily overlooked and are difficult to distinguish from blood vessels.
-
-                                                       This study included 33 TBI patients. Because of the laborious nature of manually annotating CMBs, only one trained expert manually annotated the CMBs in all 33 patients. A subset of ten TBI patients was annotated by six experts. Our CAD system makes use of both Susceptibility Weighted Imaging (SWI) and T1 weighted magnetic resonance images to detect CMBs. After pre-processing these images, a two-step approach was used for automated detection of CMBs. In the first step, each voxel was characterized by twelve features based on the dark and spherical nature of CMBs and a random forest classifier was used to identify CMB candidate locations. In the second step, segmentations were made from each identified candidate location. Subsequently an object-based classifier was used to remove false positive detections of the voxel classifier, by considering seven object-based features that discriminate between spherical objects (CMBs) and elongated objects (blood vessels). A guided user interface was designed for fast evaluation of the CAD system result. During this process, an expert checked each CMB detected by the CAD system.
-
-                                                       A Fleiss' kappa value of only 0.24 showed that the inter-observer variability for the TBI patients in this study was very large. An expert using the guided user interface reached an average sensitivity of 93%, which was significantly higher (p = 0.03) than the average sensitivity of 77% (sd 12.4%) that the six experts manually detected. Furthermore, with the use of this CAD system the reading time was substantially reduced from one hour to 13 minutes per patient, because the CAD system only detects on average 25.9 false positives per TBI patient, resulting in 0.29 false positives per definite CMB finding.},
+  
+                                                         This study included 33 TBI patients. Because of the laborious nature of manually annotating CMBs, only one trained expert manually annotated the CMBs in all 33 patients. A subset of ten TBI patients was annotated by six experts. Our CAD system makes use of both Susceptibility Weighted Imaging (SWI) and T1 weighted magnetic resonance images to detect CMBs. After pre-processing these images, a two-step approach was used for automated detection of CMBs. In the first step, each voxel was characterized by twelve features based on the dark and spherical nature of CMBs and a random forest classifier was used to identify CMB candidate locations. In the second step, segmentations were made from each identified candidate location. Subsequently an object-based classifier was used to remove false positive detections of the voxel classifier, by considering seven object-based features that discriminate between spherical objects (CMBs) and elongated objects (blood vessels). A guided user interface was designed for fast evaluation of the CAD system result. During this process, an expert checked each CMB detected by the CAD system.
+  
+                                                         A Fleiss' kappa value of only 0.24 showed that the inter-observer variability for the TBI patients in this study was very large. An expert using the guided user interface reached an average sensitivity of 93%, which was significantly higher (p = 0.03) than the average sensitivity of 77% (sd 12.4%) that the six experts manually detected. Furthermore, with the use of this CAD system the reading time was substantially reduced from one hour to 13 minutes per patient, because the CAD system only detects on average 25.9 false positives per TBI patient, resulting in 0.29 false positives per definite CMB finding.},
   file = {Heuv16.pdf:pdf\\Heuv16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27489772},
@@ -12180,9 +12204,9 @@ @inproceedings{Heuv17
   pages = {101390V},
   doi = {10.1117/12.2253671},
   abstract = {Worldwide, 99% of all maternal deaths occur in low-resource countries. Ultrasound imaging can be used to detect maternal risk factors, but requires a well-trained sonographer to obtain the biometric parameters of the fetus. One of the most important biometric parameters is the fetal Head Circumference (HC). The HC can be used to estimate the Gestational Age (GA) and assess the growth of the fetus. In this paper we propose a method to estimate the fetal HC with the use of the Obstetric Sweep Protocol (OSP). With the OSP the abdomen of pregnant women is imaged with the use of sweeps. These sweeps can be taught to somebody without any prior knowledge of ultrasound within a day.
-                                                       Both the OSP and the standard two-dimensional ultrasound image for HC assessment were acquired by an experienced gynecologist from fifty pregnant women in St. Luke's Hospital in Wolisso, Ethiopia. The reference HC from the standard two-dimensional ultrasound image was compared to both the manually measured HC and the automatically measured HC from the OSP data.
-                                                       The median difference between the estimated GA from the manual measured HC using the OSP and the reference standard was -1.1 days (Median Absolute Deviation (MAD) 7.7 days). The median difference between the estimated GA from the automatically measured HC using the OSP and the reference standard was -6.2 days (MAD 8.6 days).
-                                                       Therefore, it can be concluded that it is possible to estimate the fetal GA with simple obstetric sweeps with a deviation of only one week.},
+                                                         Both the OSP and the standard two-dimensional ultrasound image for HC assessment were acquired by an experienced gynecologist from fifty pregnant women in St. Luke's Hospital in Wolisso, Ethiopia. The reference HC from the standard two-dimensional ultrasound image was compared to both the manually measured HC and the automatically measured HC from the OSP data.
+                                                         The median difference between the estimated GA from the manually measured HC using the OSP and the reference standard was -1.1 days (Median Absolute Deviation (MAD) 7.7 days). The median difference between the estimated GA from the automatically measured HC using the OSP and the reference standard was -6.2 days (MAD 8.6 days).
+                                                         Therefore, it can be concluded that it is possible to estimate the fetal GA with simple obstetric sweeps with a deviation of only one week.},
   file = {Heuv17.pdf:pdf\\Heuv17.pdf:PDF},
   optnote = {DIAG, MUSIC, RADIOLOGY},
   month = {3},
@@ -12296,31 +12320,31 @@ @conference{Heuv19
   booktitle = DBME,
   year = {2019},
   abstract = {Worldwide, 99% of all maternal deaths occur in developing countries. Ultrasound can be used
-                                                       to detect maternal risk factors, but this technique is rarely used in developing countries
-                                                       because it is too expensive, and it requires a trained sonographer to acquire and interpret the
-                                                       ultrasound images. In this work we use a low-cost ultrasound device which was combined
-                                                       with the obstetric sweep protocol (OSP) and deep learning algorithms to automatically detect
-                                                       maternal risk factors. The OSP can be taught to any health care worker without prior
-                                                       knowledge of ultrasound within one day, so there is no need for a trained sonographer.
-                                                       The OSP was acquired from 318 pregnant women using the low-cost MicrUs (Telemed
-                                                       Ultrasound Medical Systems, Milan, Italy) in Ethiopia. Two deep learning networks and two
-                                                       random forest classifiers were trained to automatically detect twin pregnancies, estimate
-                                                       gestational age (GA) and determine fetal presentation. The first deep learning network
-                                                       performs a frame classification, which was used to automatically separate the six sweeps of
-                                                       the OSP and automatically detect the fetal head and torso. The second deep learning network
-                                                       was trained to measure the fetal head circumference (HC) using all frames in which the first
-                                                       deep learning system detected the fetal head. The HC was used to determine the GA. Two
-                                                       random forest classifiers were trained to detect twin pregnancies and determine fetal
-                                                       presentation using the frame classification of the first deep learning network.
-                                                       The developed algorithm can automatically estimate the GA with an interquartile range of
-                                                       15.2 days, correctly detected 61% of all twins with a specificity of 99%, and correctly detect
-                                                       all 31 breech presentations and 215 of the 216 cephalic presentations. The developed
-                                                       algorithm can be computed in less than two seconds, making real-time application feasible.
-                                                       The presented system is able to determine three maternal risk factors using the OSP. The OSP
-                                                       can be acquired without the need of a trained sonographer, which makes widespread obstetric
-                                                       ultrasound affordable and fast to implement in resource-limited settings. This makes is
-                                                       possible to refer pregnant women in time to a hospital to receive treatment when risk factors
-                                                       are detected.},
+                                                         to detect maternal risk factors, but this technique is rarely used in developing countries
+                                                         because it is too expensive, and it requires a trained sonographer to acquire and interpret the
+                                                         ultrasound images. In this work we use a low-cost ultrasound device which was combined
+                                                         with the obstetric sweep protocol (OSP) and deep learning algorithms to automatically detect
+                                                         maternal risk factors. The OSP can be taught to any health care worker without prior
+                                                         knowledge of ultrasound within one day, so there is no need for a trained sonographer.
+                                                         The OSP was acquired from 318 pregnant women using the low-cost MicrUs (Telemed
+                                                         Ultrasound Medical Systems, Milan, Italy) in Ethiopia. Two deep learning networks and two
+                                                         random forest classifiers were trained to automatically detect twin pregnancies, estimate
+                                                         gestational age (GA) and determine fetal presentation. The first deep learning network
+                                                         performs a frame classification, which was used to automatically separate the six sweeps of
+                                                         the OSP and automatically detect the fetal head and torso. The second deep learning network
+                                                         was trained to measure the fetal head circumference (HC) using all frames in which the first
+                                                         deep learning system detected the fetal head. The HC was used to determine the GA. Two
+                                                         random forest classifiers were trained to detect twin pregnancies and determine fetal
+                                                         presentation using the frame classification of the first deep learning network.
+                                                         The developed algorithm can automatically estimate the GA with an interquartile range of
+                                                         15.2 days, correctly detected 61% of all twins with a specificity of 99%, and correctly detect
+                                                         all 31 breech presentations and 215 of the 216 cephalic presentations. The developed
+                                                         algorithm can be computed in less than two seconds, making real-time application feasible.
+                                                         The presented system is able to determine three maternal risk factors using the OSP. The OSP
+                                                         can be acquired without the need of a trained sonographer, which makes widespread obstetric
+                                                         ultrasound affordable and fast to implement in resource-limited settings. This makes it
+                                                         possible to refer pregnant women in time to a hospital to receive treatment when risk factors
+                                                         are detected.},
   file = {Heuv19.pdf:pdf\\Heuv19.pdf:PDF},
   optnote = {DIAG, MUSIC, RADIOLOGY},
 }
@@ -12735,10 +12759,10 @@ @inproceedings{Holl16
   series = SPIE,
   doi = {10.1117/12.2216810},
   abstract = {The sensitivity of mammograms is low for women with dense breasts, since cancers may be masked by dense tissue. In this study, we investigated methods to identify women with density patterns associated with a high masking risk. Risk measures are derived from volumetric breast density maps.
-                                                       We used the last negative screening mammograms of 93 women who subsequently presented with an interval cancer (IC), and, as controls, 930 randomly selected normal screening exams from women without cancer. Volumetric breast density maps were computed from the mammograms, which provide the dense tissue thickness at each location. These were used to compute absolute and percentage glandular tissue volume.
-                                                       We modeled the masking risk for each pixel location using the absolute and percentage dense tissue thickness and we investigated the effect of taking the cancer location probability distribution (CLPD) into account.
-                                                       For each method, we selected cases with the highest masking measure (by thresholding) and computed the fraction of ICs as a function of the fraction of controls selected. The latter can be interpreted as the negative supplemental screening rate (NSSR).
-                                                       Between the models, when incorporating CLPD, no significant differences were found. In general, the methods performed better when CLPD was included. At higher NSSRs some of the investigated masking measures had a significantly higher performance than volumetric breast density. These measures may therefore serve as an alternative to identify women with a high risk for a masked cancer.},
+                                                         We used the last negative screening mammograms of 93 women who subsequently presented with an interval cancer (IC), and, as controls, 930 randomly selected normal screening exams from women without cancer. Volumetric breast density maps were computed from the mammograms, which provide the dense tissue thickness at each location. These were used to compute absolute and percentage glandular tissue volume.
+                                                         We modeled the masking risk for each pixel location using the absolute and percentage dense tissue thickness and we investigated the effect of taking the cancer location probability distribution (CLPD) into account.
+                                                         For each method, we selected cases with the highest masking measure (by thresholding) and computed the fraction of ICs as a function of the fraction of controls selected. The latter can be interpreted as the negative supplemental screening rate (NSSR).
+                                                         Between the models, when incorporating CLPD, no significant differences were found. In general, the methods performed better when CLPD was included. At higher NSSRs some of the investigated masking measures had a significantly higher performance than volumetric breast density. These measures may therefore serve as an alternative to identify women with a high risk for a masked cancer.},
   file = {:pdf/Holl16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   month = {3},
@@ -12759,8 +12783,8 @@ @inproceedings{Holl16a
   pages = {183-189},
   doi = {10.1007/978-3-319-41546-8_24},
   abstract = {During mammographic acquisition, the breast is compressed between the breast support plate and the compression paddle to improve image quality and reduce dose, among other reasons. The applied force, which is measured by the imaging device, varies substantially, due to local guidelines, positioning, and breast size. Force measurements may not be very relevant though, because the amount of compression will be related to pressure rather than force. With modern image analysis techniques, the contact surface of the breast under compression can be determined and pressure can be computed retrospectively. In this study, we investigate if there is a relation between pressure applied to the breast during compression and screening performance.
-                                                       In a series of 113,464 screening exams from the Dutch breast cancer screening program we computed the compression pressure applied in the MLO projections of the right and left breasts. The exams were binned into five groups of increasing applied pressure, in such a way that each group contains 20% of the exams. Thresholds were 7.68, 9.18, 10.71 and 12.81 kPa. Screening performance measures were determined for each group. Differences across the groups were investigated with a Pearson's Chi Square test. It was found that PPV and the cancer detection rate vary significantly within the five groups (p = 0.001 and p = 0.011 respectively).The PPV was 25.4, 31.2, 32.7, 25.8 and 22.0 for the five groups with increasing pressure. The recall rate, false positive rate and specificity were not statistically significant from the expectation (p-values: 0.858, 0.088 and 0.094 respectively). Even though differences are not significant, there is a trend that the groups with a moderate pressure have a better performance compared to the first and last category.
-                                                       The results suggest that high pressure reduces detectability of breast cancer. The best screening results were found in the groups with a moderate pressure.},
+                                                         In a series of 113,464 screening exams from the Dutch breast cancer screening program we computed the compression pressure applied in the MLO projections of the right and left breasts. The exams were binned into five groups of increasing applied pressure, in such a way that each group contains 20% of the exams. Thresholds were 7.68, 9.18, 10.71 and 12.81 kPa. Screening performance measures were determined for each group. Differences across the groups were investigated with a Pearson's Chi Square test. It was found that PPV and the cancer detection rate vary significantly within the five groups (p = 0.001 and p = 0.011 respectively). The PPV was 25.4, 31.2, 32.7, 25.8 and 22.0 for the five groups with increasing pressure. The recall rate, false positive rate and specificity were not statistically significant from the expectation (p-values: 0.858, 0.088 and 0.094 respectively). Even though differences are not significant, there is a trend that the groups with a moderate pressure have a better performance compared to the first and last category.
+                                                         The results suggest that high pressure reduces detectability of breast cancer. The best screening results were found in the groups with a moderate pressure.},
   file = {:pdf/Holl16a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   gsid = {10535534135064321671},
@@ -12800,9 +12824,9 @@ @article{Holl17
   pages = {541--548},
   doi = {10.1007/s10549-017-4137-4},
   abstract = {Purpose: Fibroglandular tissue may mask breast cancers, thereby reducing the sensitivity of mammography. Here we investigate methods for identification of women at high risk of a masked tumor, who could benefit from additional imaging.
-                                                       Methods: The last negative screening mammograms of 111 women with interval cancer (IC) within 12 months after the examination and 1110 selected normal screening exams from women without cancer were used. From the mammograms volumetric breast density maps were computed, which provide the dense tissue thickness for each pixel location. With these maps, three measurements were derived: 1) Percent dense volume (PDV), 2) Percent area where dense tissue thickness exceeds 1cm (PDA), 3) Dense Tissue Masking Model (DTMM). Breast density was scored by a breast radiologist using BI-RADS. Women with heterogeneously and extremely dense breasts were considered at high masking risk. For each masking measure, mammograms were divided into a high and low risk category, such that the same proportion of the controls is at high masking risk as with BI-RADS.
-                                                       Results: Of the women with IC, 66.1%, 71.9%, 69.2% and 63.0% were categorized to be at high masking risk with PDV, PDA, DTMM and BI-RADS respectively, against 38.5% of the controls. The proportion of IC at high masking risk is statistically significantly different between BI-RADS and PDA (p-value 0.022). Differences between BI-RADS and PDV, or BI-RADS and DTMM, are not statistically significant.
-                                                       Conclusion: Measures based on density maps, and in particular PDA, are promising tools to identify women at high risk for a masked cancer.},
+                                                         Methods: The last negative screening mammograms of 111 women with interval cancer (IC) within 12 months after the examination and 1110 selected normal screening exams from women without cancer were used. From the mammograms volumetric breast density maps were computed, which provide the dense tissue thickness for each pixel location. With these maps, three measurements were derived: 1) Percent dense volume (PDV), 2) Percent area where dense tissue thickness exceeds 1cm (PDA), 3) Dense Tissue Masking Model (DTMM). Breast density was scored by a breast radiologist using BI-RADS. Women with heterogeneously and extremely dense breasts were considered at high masking risk. For each masking measure, mammograms were divided into a high and low risk category, such that the same proportion of the controls is at high masking risk as with BI-RADS.
+                                                         Results: Of the women with IC, 66.1%, 71.9%, 69.2% and 63.0% were categorized to be at high masking risk with PDV, PDA, DTMM and BI-RADS respectively, against 38.5% of the controls. The proportion of IC at high masking risk is statistically significantly different between BI-RADS and PDA (p-value 0.022). Differences between BI-RADS and PDV, or BI-RADS and DTMM, are not statistically significant.
+                                                         Conclusion: Measures based on density maps, and in particular PDA, are promising tools to identify women at high risk for a masked cancer.},
   file = {:pdf/Holl17.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {28161786},
@@ -12824,9 +12848,9 @@ @article{Holl17a
   pages = {3779--3797},
   doi = {10.1088/1361-6560/aa628f},
   abstract = {Fibroglandular tissue volume and percent density can be estimated in unprocessed mammograms using a physics-based method, which relies on an internal reference value representing the projection of fat only. However, pixels representing fat only may not be present in dense breasts, causing an underestimation of density measurements. In this work, we investigate alternative approaches for obtaining a tissue reference value to improve density estimations, particularly in dense breasts.
-                                                       Two of three investigated reference values (F1, F2) are percentiles of the pixel value distribution in the breast interior (the contact area of breast and compression paddle). F1 is determined in a small breast interior, which minimizes the risk that peripheral pixels are included in the measurement at the cost of increasing the chance that no proper reference can be found. F2 is obtained using a larger breast interior. The new approach which is developed for very dense breasts does not require the presence of a fatty tissue region. As reference region we select the densest region in the mammogram and assume that this represents a projection of entirely dense tissue embedded between the subcutaneous fatty tissue layers. By measuring the thickness of the fat layers a reference (F3) can be computed. To obtain accurate breast density estimates irrespective of breast composition we investigated a combination of the results of the three reference values. We collected 202 pairs of MRI's and digital mammograms from 119 women. We compared the percent dense volume estimates based on both modalities and calculated Pearson's correlation coefficients.
-                                                       With the references F1-F3 we found respectively a correlation of R=0.80, R=0.89 and R=0.74. Best results were obtained with the combination of the density estimations (R=0.90).
-                                                       Results show that better volumetric density estimates can be obtained with the hybrid method, in particular for dense breasts, when algorithms are combined to obtain a fatty tissue reference value depending on breast composition.},
+                                                         Two of three investigated reference values (F1, F2) are percentiles of the pixel value distribution in the breast interior (the contact area of breast and compression paddle). F1 is determined in a small breast interior, which minimizes the risk that peripheral pixels are included in the measurement at the cost of increasing the chance that no proper reference can be found. F2 is obtained using a larger breast interior. The new approach which is developed for very dense breasts does not require the presence of a fatty tissue region. As reference region we select the densest region in the mammogram and assume that this represents a projection of entirely dense tissue embedded between the subcutaneous fatty tissue layers. By measuring the thickness of the fat layers a reference (F3) can be computed. To obtain accurate breast density estimates irrespective of breast composition we investigated a combination of the results of the three reference values. We collected 202 pairs of MRI's and digital mammograms from 119 women. We compared the percent dense volume estimates based on both modalities and calculated Pearson's correlation coefficients.
+                                                         With the references F1-F3 we found respectively a correlation of R=0.80, R=0.89 and R=0.74. Best results were obtained with the combination of the density estimations (R=0.90).
+                                                         Results show that better volumetric density estimates can be obtained with the hybrid method, in particular for dense breasts, when algorithms are combined to obtain a fatty tissue reference value depending on breast composition.},
   file = {:pdf/Holl17a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {28230532},
@@ -12860,9 +12884,9 @@ @article{Holl17c
   doi = {10.1186/s13058-017-0917-3},
   url = {https://doi.org/10.1186/s13058-017-0917-3},
   abstract = {Background In mammography, breast compression is applied to reduce the thickness of the breast. While it is widely accepted that firm breast compression is needed to ensure acceptable image quality, guidelines remain vague about how much compression should be applied during mammogram acquisition. A quantitative parameter indicating the desirable amount of compression is not available. Consequently, little is known about the relationship between the amount of breast compression and breast cancer detectability. The purpose of this study is to determine the effect of breast compression pressure in mammography on breast cancer screening outcomes.
-                                                       Methods We used digital image analysis methods to determine breast volume, percent dense volume, and pressure from 132,776 examinations of 57,179 women participating in the Dutch population-based biennial breast cancer screening program. Pressure was estimated by dividing the compression force by the area of the contact surface between breast and compression paddle. The data was subdivided into quintiles of pressure and the number of screen-detected cancers, interval cancers, false positives, and true negatives were determined for each group. Generalized estimating equations were used to account for correlation between examinations of the same woman and for the effect of breast density and volume when estimating sensitivity, specificity, and other performance measures. Sensitivity was computed using interval cancers occurring between two screening rounds and using interval cancers within 12 months after screening. Pair-wise testing for significant differences was performed.
-                                                       Results Percent dense volume increased with increasing pressure, while breast volume decreased. Sensitivity in quintiles with increasing pressure was 82.0%, 77.1%, 79.8%, 71.1%, and 70.8%. Sensitivity based on interval cancers within 12 months was significantly lower in the highest pressure quintile compared to the third (84.3% vs 93.9%, p=0.034). Specificity was lower in the lowest pressure quintile (98.0%) compared to the second, third, and fourth group (98.5%, p<0.005). Specificity of the fifth quintile was 98.4%.
-                                                       Conclusion Results suggest that if too much pressure is applied during mammography this may reduce sensitivity. In contrast, if pressure is low this may decrease specificity.},
+                                                         Methods We used digital image analysis methods to determine breast volume, percent dense volume, and pressure from 132,776 examinations of 57,179 women participating in the Dutch population-based biennial breast cancer screening program. Pressure was estimated by dividing the compression force by the area of the contact surface between breast and compression paddle. The data was subdivided into quintiles of pressure and the number of screen-detected cancers, interval cancers, false positives, and true negatives were determined for each group. Generalized estimating equations were used to account for correlation between examinations of the same woman and for the effect of breast density and volume when estimating sensitivity, specificity, and other performance measures. Sensitivity was computed using interval cancers occurring between two screening rounds and using interval cancers within 12 months after screening. Pair-wise testing for significant differences was performed.
+                                                         Results Percent dense volume increased with increasing pressure, while breast volume decreased. Sensitivity in quintiles with increasing pressure was 82.0%, 77.1%, 79.8%, 71.1%, and 70.8%. Sensitivity based on interval cancers within 12 months was significantly lower in the highest pressure quintile compared to the third (84.3% vs 93.9%, p=0.034). Specificity was lower in the lowest pressure quintile (98.0%) compared to the second, third, and fourth group (98.5%, p<0.005). Specificity of the fifth quintile was 98.4%.
+                                                         Conclusion Results suggest that if too much pressure is applied during mammography this may reduce sensitivity. In contrast, if pressure is low this may decrease specificity.},
   file = {:pdf/Holl17c.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29183348},
@@ -13172,16 +13196,16 @@ @article{Hoss21
   doi = {10.1007/s00330-021-08320-y},
   url = {https://doi.org/10.1007/s00330-021-08320-y},
   abstract = {Objectives
-                             To assess Prostate Imaging Reporting and Data System (PI-RADS)-trained deep learning (DL) algorithm performance and to investigate the effect of data size and prior knowledge on the detection of clinically significant prostate cancer (csPCa) in biopsy-naive men with a suspicion of PCa.
-
-                             Methods
-                             Multi-institution data included 2734 consecutive biopsy-naive men with elevated PSA levels (>= 3 ng/mL) that underwent multi-parametric MRI (mpMRI). mpMRI exams were prospectively reported using PI-RADS v2 by expert radiologists. A DL framework was designed and trained on center 1 data (n = 1952) to predict PI-RADS >= 4 (n = 1092) lesions from bi-parametric MRI (bpMRI). Experiments included varying the number of cases and the use of automatic zonal segmentation as a DL prior. Independent center 2 cases (n = 296) that included pathology outcome (systematic and MRI targeted biopsy) were used to compute performance for radiologists and DL. The performance of detecting PI-RADS 4-5 and Gleason > 6 lesions was assessed on 782 unseen cases (486 center 1, 296 center 2) using free-response ROC (FROC) and ROC analysis.
-
-                             Results
-                             The DL sensitivity for detecting PI-RADS >= 4 lesions was 87% (193/223, 95% CI: 82-91) at an average of 1 false positive (FP) per patient, and an AUC of 0.88 (95% CI: 0.84-0.91). The DL sensitivity for the detection of Gleason > 6 lesions was 85% (79/93, 95% CI: 77-83) @ 1 FP compared to 91% (85/93, 95% CI: 84-96) @ 0.3 FP for a consensus panel of expert radiologists. Data size and prior zonal knowledge significantly affected performance (4%, p<0.05).
-
-                             Conclusion
-                             PI-RADS-trained DL can accurately detect and localize Gleason > 6 lesions. DL could reach expert performance using substantially more than 2000 training cases, and DL zonal segmentation.},
+                               To assess Prostate Imaging Reporting and Data System (PI-RADS)-trained deep learning (DL) algorithm performance and to investigate the effect of data size and prior knowledge on the detection of clinically significant prostate cancer (csPCa) in biopsy-naive men with a suspicion of PCa.
+  
+                               Methods
+                               Multi-institution data included 2734 consecutive biopsy-naive men with elevated PSA levels (>= 3 ng/mL) that underwent multi-parametric MRI (mpMRI). mpMRI exams were prospectively reported using PI-RADS v2 by expert radiologists. A DL framework was designed and trained on center 1 data (n = 1952) to predict PI-RADS >= 4 (n = 1092) lesions from bi-parametric MRI (bpMRI). Experiments included varying the number of cases and the use of automatic zonal segmentation as a DL prior. Independent center 2 cases (n = 296) that included pathology outcome (systematic and MRI targeted biopsy) were used to compute performance for radiologists and DL. The performance of detecting PI-RADS 4-5 and Gleason > 6 lesions was assessed on 782 unseen cases (486 center 1, 296 center 2) using free-response ROC (FROC) and ROC analysis.
+  
+                               Results
+                               The DL sensitivity for detecting PI-RADS >= 4 lesions was 87% (193/223, 95% CI: 82-91) at an average of 1 false positive (FP) per patient, and an AUC of 0.88 (95% CI: 0.84-0.91). The DL sensitivity for the detection of Gleason > 6 lesions was 85% (79/93, 95% CI: 77-83) @ 1 FP compared to 91% (85/93, 95% CI: 84-96) @ 0.3 FP for a consensus panel of expert radiologists. Data size and prior zonal knowledge significantly affected performance (4%, p<0.05).
+  
+                               Conclusion
+                               PI-RADS-trained DL can accurately detect and localize Gleason > 6 lesions. DL could reach expert performance using substantially more than 2000 training cases, and DL zonal segmentation.},
   taverne_url = {https://repository.ubn.ru.nl/handle/2066/249485},
   ss_id = {21face92913ea6919840f59cd3cd5e84e70ebc7d},
   all_ss_ids = {['21face92913ea6919840f59cd3cd5e84e70ebc7d']},
@@ -13220,8 +13244,8 @@ @mastersthesis{Hout20
   author = {Thijs van den Hout},
   title = {Automatic muscle and fat segmentation in 3D abdominal CT images for body composition assessment},
   abstract = {Body composition is an informative biomarker in the treatment of cancer. In particular, low muscle mass has been associated with higher chemotherapy toxicity, shorter time to tumor progression, poorer surgical outcomes, impaired functional status, and shorter survival. However, because CT-based body composition assessment requires outlining the different tissues in the image, which is timeconsuming, its practical value is currently limited. To form an estimate of body composition, different tissues are often segmented manually in a single 2D slice from the abdomen.
-                              For use in both routine care and in research studies, automatic segmentation of the different tissue types in the abdomen is desirable.
-                              This study focuses on the development and testing of an automatic approach to segment muscle and fat tissue in the entire abdomen. The four classes of interest are skeletal muscle (SM), inter-muscular adipose tissue (IMAT), visceral adipose tissue (VAT), and subcutaneous adipose tissue (SAT). A deep neural network is trained on two-dimensional CT slices at the level of the third lumbar vertebra. Three experiments were carried out with the goal of improving the network with information from other, unannotated data sources. Active learning methods were applied to sample additional data to annotate and include in the training of the model. The proposed algorithm combines two models to segment muscle and fat in the entire abdomen and achieves state-of-the-art results. Dice scores of 0.91, 0.84, 0.97, and 0.97 were attained for SM, IMAT, VAT, and SAT, respectively, averaged over five locations throughout the abdomen.},
+                                For use in both routine care and in research studies, automatic segmentation of the different tissue types in the abdomen is desirable.
+                                This study focuses on the development and testing of an automatic approach to segment muscle and fat tissue in the entire abdomen. The four classes of interest are skeletal muscle (SM), inter-muscular adipose tissue (IMAT), visceral adipose tissue (VAT), and subcutaneous adipose tissue (SAT). A deep neural network is trained on two-dimensional CT slices at the level of the third lumbar vertebra. Three experiments were carried out with the goal of improving the network with information from other, unannotated data sources. Active learning methods were applied to sample additional data to annotate and include in the training of the model. The proposed algorithm combines two models to segment muscle and fat in the entire abdomen and achieves state-of-the-art results. Dice scores of 0.91, 0.84, 0.97, and 0.97 were attained for SM, IMAT, VAT, and SAT, respectively, averaged over five locations throughout the abdomen.},
   file = {Hout20.pdf:pdf/Hout20.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University Medical Center},
@@ -13613,7 +13637,7 @@ @phdthesis{Hump24
   year = {2024},
   url = {https://repository.ubn.ru.nl/handle/2066/307069},
   abstract = {The work presented in this thesis is focused on using DL for detecting and segmenting structures in CT scans. In Chapter 2, we present a method for localizing organs in 2D orthogonal views; this method combines the outputs of each orthogonal view to compose a 3D bounding box per organ. In Chapter 3, we apply a state-of-the-art segmentation algorithm using Convolutional Neural Networks (CNN) to segment the spleen, achieving performance comparable to that of an independent observer.
-      In an observer experiment, the radiologist rated the segmentation quality as 94% as ready for clinical use. Additionally, we performed an experiment to measure the splenic volume change over time. In Chapter 4, we segment the kidneys and kidney abnormalities, including cysts, lesions, masses, metastases, and tumors. We conducted an ablation study to analyze the performance of five components of the method. In Chapter 5, we explore the use of transfer learning to segment additional structures using a partially annotated dataset (a junction of publicly available datasets and data from public challenges). Finally, Chapter 6, provides the general discussion and summary of this thesis.},
+        In an observer experiment, the radiologist rated the segmentation quality as 94% as ready for clinical use. Additionally, we performed an experiment to measure the splenic volume change over time. In Chapter 4, we segment the kidneys and kidney abnormalities, including cysts, lesions, masses, metastases, and tumors. We conducted an ablation study to analyze the performance of five components of the method. In Chapter 5, we explore the use of transfer learning to segment additional structures using a partially annotated dataset (a junction of publicly available datasets and data from public challenges). Finally, Chapter 6, provides the general discussion and summary of this thesis.},
   copromotor = {Jacobs, C. and Lessmann, N.},
   file = {Hump24.pdf:pdf\\Hump24.pdf:PDF},
   optnote = {DIAG},
@@ -13790,9 +13814,9 @@ @article{Hänt24
   doi = {10.48550/ARXIV.2405.06463},
   year = {2024},
   abstract = {Purpose: To introduce a deep learning model capable of multi-organ segmentation in MRI scans, offering a solution to the current limitations in MRI analysis due to challenges in resolution, standardized intensity values, and variability in sequences.
-       Materials and Methods: he model was trained on 1,200 manually annotated MRI scans from the UK Biobank, 221 in-house MRI scans and 1228 CT scans, leveraging cross-modality transfer learning from CT segmentation models. A human-in-the-loop annotation workflow was employed to efficiently create high-quality segmentations. The model's performance was evaluated on NAKO and the AMOS22 dataset containing 600 and 60 MRI examinations. Dice Similarity Coefficient (DSC) and Hausdorff Distance (HD) was used to assess segmentation accuracy. The model will be open sourced.
-       Results: The model showcased high accuracy in segmenting well-defined organs, achieving Dice Similarity Coefficient (DSC) scores of 0.97 for the right and left lungs, and 0.95 for the heart. It also demonstrated robustness in organs like the liver (DSC: 0.96) and kidneys (DSC: 0.95 left, 0.95 right), which present more variability. However, segmentation of smaller and complex structures such as the portal and splenic veins (DSC: 0.54) and adrenal glands (DSC: 0.65 left, 0.61 right) revealed the need for further model optimization.
-       Conclusion: The proposed model is a robust, tool for accurate segmentation of 40 anatomical structures in MRI and CT images. By leveraging cross-modality learning and interactive annotation, the model achieves strong performance and generalizability across diverse datasets, making it a valuable resource for researchers and clinicians. It is open source and can be downloaded from https://github.com/hhaentze/MRSegmentator.},
+         Materials and Methods: The model was trained on 1,200 manually annotated MRI scans from the UK Biobank, 221 in-house MRI scans and 1228 CT scans, leveraging cross-modality transfer learning from CT segmentation models. A human-in-the-loop annotation workflow was employed to efficiently create high-quality segmentations. The model's performance was evaluated on NAKO and the AMOS22 dataset containing 600 and 60 MRI examinations. Dice Similarity Coefficient (DSC) and Hausdorff Distance (HD) was used to assess segmentation accuracy. The model will be open sourced.
+         Results: The model showcased high accuracy in segmenting well-defined organs, achieving Dice Similarity Coefficient (DSC) scores of 0.97 for the right and left lungs, and 0.95 for the heart. It also demonstrated robustness in organs like the liver (DSC: 0.96) and kidneys (DSC: 0.95 left, 0.95 right), which present more variability. However, segmentation of smaller and complex structures such as the portal and splenic veins (DSC: 0.54) and adrenal glands (DSC: 0.65 left, 0.61 right) revealed the need for further model optimization.
+         Conclusion: The proposed model is a robust, tool for accurate segmentation of 40 anatomical structures in MRI and CT images. By leveraging cross-modality learning and interactive annotation, the model achieves strong performance and generalizability across diverse datasets, making it a valuable resource for researchers and clinicians. It is open source and can be downloaded from https://github.com/hhaentze/MRSegmentator.},
   url = {https://arxiv.org/abs/2405.06463},
   file = {Hant24.pdf:pdf\\Hant24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -14262,7 +14286,7 @@ @conference{Jaco12c
   booktitle = RSNA,
   year = {2012},
   abstract = {{BACKGROUND} Today, lung cancer is the most common and most deadly cancer in men and women worldwide. The recent positive results of the National Lung Screening Trial (NLST) [1] have provided clear scientific evidence that screening with low dose chest CT reduces lung cancer mortality. The National Comprehensive Cancer Network has updated their recommendations for screening and now strongly recommend the use of low-dose CT screening for individuals at high risk for lung cancer [2]. At least one health insurance company has started to cover the cost of lung screening. In its current form, however, large scale introduction of CT lung screening would put an enormous burden on radiologists. Building upon our clinical and technical experience in reading, image analysis and data processing for large screening trials in Europe (over 30,000 CT scans from 10,000 participants) and a careful review of the existing commercially available lung workstations, we have developed a new dedicated chest reading workstation with a number of innovations that allows for an optimized high throughput workflow to report on low dose chest CT scans. The application is currently available as a research prototype and is in use at five sites. It is used in clinical research and includes automated detection, linking, volumetry, interval change analysis, and estimation of malignancy for each nodule finding. A structured report for each patient is produced with follow-up recommendations according to several guidelines, including the upcoming revised Fleischner Society guidelines for the management of pulmonary nodules. {METHODOLOGY/APPLICATION} The workstation that will be demonstrated includes a number of innovations and enhancements compared to currently commercially available software. - Each scan is preprocessed and scan quality is automatically assessed. Scans with low quality, artifacts or underlying interstitial lung disease are automatically flagged. 
- Each scan is elastically registered to all prior scans of the same patient. Findings in prior scans are propagated and linked to findings in the current scan. All scans and processing results are preloaded in the background to ensure rapid reading. - Highly effective computerized detection (CAD) of solid nodules [3] and sub-solid nodules [4] is integrated. - High throughput reading with CAD as a first reader is supported. Users can accept/reject at a setting of on average 10 to 15 candidate lesions per scan and thus report much quicker than traditional thin sliding MIP viewing of the entire volume (also supported). - Each nodule is automatically characterized as solid, part-solid, or non-solid and nodules with benign characteristics are automatically flagged. Detected benign characteristics include calcifications and peri-fissural opacities (lymph nodes). - Volumetry, volume growth rate, mass and mass growth rate are automatically computed with advanced segmentation algorithms [5] that have been extended to handle sub-solid lesions and segment the solid core of part-solid nodules. If necessary, the user can interactively adjust segmentations and compare side by side with the corresponding finding in all available prior scans to detect and correct segmentation inconsistencies. - Findings are summarized in a structured report in HTML and PDF format that is stored in the database and can be sent to requesting physicians. Follow-up recommendation according to various screening algorithms and guidelines of leading Societies are included. {DEMONSTRATION STRATEGY} The exhibit will be accompanied by an informational poster that will highlight the key differences between the proposed workflow and current clinical practice. The poster will also explain algorithmic concepts that underlie the automated analysis. 
Attendees will be able to gain hands-on experience with the workstation and read cases and use nodule detection, automated and interactive volumetry, see the results of classification of different nodule types and produce structured reports with follow-up recommendations, all within
-                                                       minutes.},
+                                                         minutes.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -14671,16 +14695,19 @@ @conference{Jong22
 
 @article{Jurg24,
   author = {Jurgas, Artur and Wodzinski, Marek and D'Amato, Marina and van der Laak, Jeroen and Atzori, Manfredo and M\"{u}ller, Henning},
-  title = {Improving Quality Control of Whole Slide Images by Explicit Artifact Augmentation},
-  doi = {10.48550/ARXIV.2406.11538},
+  title = {Improving quality control of whole slide images by explicit artifact augmentation},
+  doi = {10.1038/s41598-024-68667-2},
   year = {2024},
   abstract = {The problem of artifacts in whole slide image acquisition, prevalent in both clinical workflows and research-oriented settings, necessitates human intervention and re-scanning. Overcoming this challenge requires developing quality control algorithms, that are hindered by the limited availability of relevant annotated data in histopathology. The manual annotation of ground-truth for artifact detection methods is expensive and time-consuming. This work addresses the issue by proposing a method dedicated to augmenting whole slide images with artifacts. The tool seamlessly generates and blends artifacts from an external library to a given histopathology dataset. The augmented datasets are then utilized to train artifact classification methods. The evaluation shows their usefulness in classification of the artifacts, where they show an improvement from 0.10 to 0.01 AUROC depending on the artifact type. The framework, model, weights, and ground-truth annotations are freely released to facilitate open science and reproducible research.},
-  url = {https://arxiv.org/abs/2406.11538},
+  url = {http://dx.doi.org/10.1038/s41598-024-68667-2},
   file = {Jurg24.pdf:pdf\\Jurg24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
-  journal = {arXiv:2406.11538},
+  journal = {Scientific Reports},
   automatic = {yes},
   all_ss_ids = {['da98916e195198e3d82ab58316e452fa164400e4']},
+  citation-count = {0},
+  volume = {14},
+  pmid = {39090284},
 }
 
 @article{K.17,
@@ -14999,10 +15026,10 @@ @conference{Kall16b
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE In personalized breast cancer screening stratification is commonly based on breast density. It has been suggested though, that breast density is a too coarse descriptor for breast cancer risk. Several authors have developed texture features that are potentially more predictive of breast cancer. Yet, in several studies, strong correlation between both types of features is an issue. In this work we investigate a method to generate deep learning texture features that are independent of breast density.
-                                                       METHOD AND MATERIALS From the Dutch breast cancer screening program we collected 394 cancers and 1182 age matched healthy controls. To obtain mammograms without signs of cancerous tissue, we took the contralateral mammograms. For each image breast density was computed using automated software. Texture features were automatically learned from the data by means of techniques that are commonly used in deep learning. In the initial matching, breast density was on average higher in the cases than in the controls, as breast density is associated with breast cancer risk. Texture features and scores learned on this set (Td) are determined to be correlated to density. In order to obtain density independent features and scores (Ti) we balanced breast density over the cases and the controls by performing a rematching based on breast density. Non-matching cases and controls were excluded during training; in the testing phase all images were scored. We trained and tested Td and Ti to separate between cancers and controls with 5-fold cross-validation. We compared the performance of Td and Ti in terms of predictive power.
-                                                       RESULTS Spearman's rank correlation between density and Td was 0.81 (0.79-0.83). The density adjusted odds ratios for breast cancer were 1.15 (0.81-1.65), 1.40 (0.98-2.00), and 1.39 (0.92-2.09) for quartile 2-4 respectively, relative to quartile 1. For Ti the correlation with density was 0.00 (-0.06 - 0.05). The odds ratios were 1.15 (0.82-1.62), 1.33 (0.96-1.86), and 1.45 (1.05-2.01). The AUC for separating cancers from controls was 0.539 (0.506-0.572).
-                                                       CONCLUSION We developed a method for generating density independent texture features and scores. The obtained texture scores were significantly associated with breast cancer risk.
-                                                       CLINICAL RELEVANCE/APPLICATION The obtained density independent texture features may enhance breast cancer risk models beyond breast density, and as such offer opportunities to further optimize personalized breast cancer screening.},
+                                                         METHOD AND MATERIALS From the Dutch breast cancer screening program we collected 394 cancers and 1182 age matched healthy controls. To obtain mammograms without signs of cancerous tissue, we took the contralateral mammograms. For each image breast density was computed using automated software. Texture features were automatically learned from the data by means of techniques that are commonly used in deep learning. In the initial matching, breast density was on average higher in the cases than in the controls, as breast density is associated with breast cancer risk. Texture features and scores learned on this set (Td) are determined to be correlated to density. In order to obtain density independent features and scores (Ti) we balanced breast density over the cases and the controls by performing a rematching based on breast density. Non-matching cases and controls were excluded during training; in the testing phase all images were scored. We trained and tested Td and Ti to separate between cancers and controls with 5-fold cross-validation. We compared the performance of Td and Ti in terms of predictive power.
+                                                         RESULTS Spearman's rank correlation between density and Td was 0.81 (0.79-0.83). The density adjusted odds ratios for breast cancer were 1.15 (0.81-1.65), 1.40 (0.98-2.00), and 1.39 (0.92-2.09) for quartile 2-4 respectively, relative to quartile 1. For Ti the correlation with density was 0.00 (-0.06 - 0.05). The odds ratios were 1.15 (0.82-1.62), 1.33 (0.96-1.86), and 1.45 (1.05-2.01). The AUC for separating cancers from controls was 0.539 (0.506-0.572).
+                                                         CONCLUSION We developed a method for generating density independent texture features and scores. The obtained texture scores were significantly associated with breast cancer risk.
+                                                         CLINICAL RELEVANCE/APPLICATION The obtained density independent texture features may enhance breast cancer risk models beyond breast density, and as such offer opportunities to further optimize personalized breast cancer screening.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -15311,10 +15338,10 @@ @conference{Kars16
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE While firm breast compression is generally thought to be required for high quality mammograms the relationship between the amount of compression and screening performance has not been studied systematically. The aim of this study is to determine breast cancer screening outcomes in relation to the compression pressure applied during mammography.
-                                                       METHOD AND MATERIALS A consecutive series of 111,870 digital screening examinations performed in 53,684 women between July 2003 and December 2011 was collected from a screening centre operating within a nationwide breast cancer screening program. A total of 662 screen-detected cancers were included in this series, while 280 interval cancers corresponding to the selected exams were identified by linkage to the Dutch Cancer Registry. Using a research version of Volpara Density software (Volpara Solutions, Wellington, NZ) breast volume (V), dense tissue volume (VD), and volumetric density grade (VDG), were estimated for each exam, while compression pressure was estimated for medio-lateral oblique (MLO) view by dividing the compression force by the area of contact surface between the breast and the compression paddle. We calculated frequencies of recalls, screen-detected cancers, and interval cancers stratified by compression pressure in five groups and derived program sensitivity, specificity, and positive predictive value (PPV). In addition, for each group we computed mean values of V, VD, and VDG. For statistical analysis Pearson's Chi-squared test was used.
-                                                       RESULTS Screening outcomes were different in the five compression pressure groups (p=0.004). Program sensitivity decreased with increasing pressure (77.0%, 69.7%, 74.5%, 63.2%, 66.7%) (p=0.02), specificity was similar, and PPV was highest in the midrange of pressure (28.5%, 31.0%, 34.2%, 26.7%, 25.7%) (p=0.03). Cutoff points for pressure dividing the data in groups of 20% were 7.7, 9.2, 10.7, 12.8 kPa. V and VD both decreased with increasing pressure. Mean VDG moderately increased (1.75, 2.0, 2.2, 2.4, 2.8).
-                                                       CONCLUSION Results suggest that if too much pressure is applied during mammography this may increase interval cancer rates and decrease PPV.
-                                                       CLINICAL RELEVANCE/APPLICATION Controlling pressure during mammography is important to decrease the discomfort experienced by women, but it may also be required to optimize screening outcomes.},
+                                                         METHOD AND MATERIALS A consecutive series of 111,870 digital screening examinations performed in 53,684 women between July 2003 and December 2011 was collected from a screening centre operating within a nationwide breast cancer screening program. A total of 662 screen-detected cancers were included in this series, while 280 interval cancers corresponding to the selected exams were identified by linkage to the Dutch Cancer Registry. Using a research version of Volpara Density software (Volpara Solutions, Wellington, NZ) breast volume (V), dense tissue volume (VD), and volumetric density grade (VDG), were estimated for each exam, while compression pressure was estimated for medio-lateral oblique (MLO) view by dividing the compression force by the area of contact surface between the breast and the compression paddle. We calculated frequencies of recalls, screen-detected cancers, and interval cancers stratified by compression pressure in five groups and derived program sensitivity, specificity, and positive predictive value (PPV). In addition, for each group we computed mean values of V, VD, and VDG. For statistical analysis Pearson's Chi-squared test was used.
+                                                         RESULTS Screening outcomes were different in the five compression pressure groups (p=0.004). Program sensitivity decreased with increasing pressure (77.0%, 69.7%, 74.5%, 63.2%, 66.7%) (p=0.02), specificity was similar, and PPV was highest in the midrange of pressure (28.5%, 31.0%, 34.2%, 26.7%, 25.7%) (p=0.03). Cutoff points for pressure dividing the data in groups of 20% were 7.7, 9.2, 10.7, 12.8 kPa. V and VD both decreased with increasing pressure. Mean VDG moderately increased (1.75, 2.0, 2.2, 2.4, 2.8).
+                                                         CONCLUSION Results suggest that if too much pressure is applied during mammography this may increase interval cancer rates and decrease PPV.
+                                                         CLINICAL RELEVANCE/APPLICATION Controlling pressure during mammography is important to decrease the discomfort experienced by women, but it may also be required to optimize screening outcomes.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -15670,18 +15697,18 @@ @article{Kauc20
   pages = {3277-3294},
   doi = {10.1007/s00330-020-06727-7},
   abstract = {In Europe, lung cancer ranks third among the most common cancers, remaining the biggest killer. Since the publication of the first European Society of Radiology and European Respiratory Society joint white paper on lung cancer screening (LCS) in 2015, many new findings have been published and discussions have increased considerably. Thus, this updated expert opinion represents a narrative, non-systematic review of the evidence from LCS trials and description of the current practice of LCS as well as aspects that have not received adequate attention until now. Reaching out to the potential participants (persons at high risk), optimal communication and shared decision-making will be key starting points. Furthermore, standards for infrastructure, pathways and quality assurance are pivotal, including promoting tobacco cessation, benefits and harms, overdiagnosis, quality, minimum radiation exposure, definition of management of positive screen results and incidental findings linked to respective actions as well as cost-effectiveness. This requires a multidisciplinary team with experts from pulmonology and radiology as well as thoracic oncologists, thoracic surgeons, pathologists, family doctors, patient representatives and others. The ESR and ERS agree that Europe's health systems need to adapt to allow citizens to benefit from organised pathways, rather than unsupervised initiatives, to allow early diagnosis of lung cancer and reduce the mortality rate. Now is the time to set up and conduct demonstration programmes focusing, among other points, on methodology, standardisation, tobacco cessation, education on healthy lifestyle, cost-effectiveness and a central registry.
-
-                                                       Key Points
-
-                                                       * Pulmonologists and radiologists both have key roles in the set up of multidisciplinary LCS teams with experts from many other fields.
-
-                                                       * Pulmonologists identify people eligible for LCS, reach out to family doctors, share the decision-making process and promote tobacco cessation.
-
-                                                       * Radiologists ensure appropriate image quality, minimum dose and a standardised reading/reporting algorithm, together with a clear definition of a "positive screen".
-
-                                                       * Strict algorithms define the exact management of screen-detected nodules and incidental findings.
-
-                                                       * For LCS to be (cost-)effective, it has to target a population defined by risk prediction models.},
+  
+                                                         Key Points
+  
+                                                         * Pulmonologists and radiologists both have key roles in the set up of multidisciplinary LCS teams with experts from many other fields.
+  
+                                                         * Pulmonologists identify people eligible for LCS, reach out to family doctors, share the decision-making process and promote tobacco cessation.
+  
+                                                         * Radiologists ensure appropriate image quality, minimum dose and a standardised reading/reporting algorithm, together with a clear definition of a "positive screen".
+  
+                                                         * Strict algorithms define the exact management of screen-detected nodules and incidental findings.
+  
+                                                         * For LCS to be (cost-)effective, it has to target a population defined by risk prediction models.},
   file = {Kauc20.pdf:pdf\\Kauc20.pdf:PDF},
   optnote = {DIAG, INPRESS, RADIOLOGY},
   pmid = {32052170},
@@ -15749,22 +15776,22 @@ @article{Kemp21
   url = {http://dx.doi.org/10.1007/s00330-021-08035-0},
   volume = {31},
   abstract = {Abstract
-                                         Objectives
-                                         Different machine learning algorithms (MLAs) for automated segmentation of gliomas have been reported in the literature. Automated segmentation of different tumor characteristics can be of added value for the diagnostic work-up and treatment planning. The purpose of this study was to provide an overview and meta-analysis of different MLA methods.
-
-                                         Methods
-                                         A systematic literature review and meta-analysis was performed on the eligible studies describing the segmentation of gliomas. Meta-analysis of the performance was conducted on the reported dice similarity coefficient (DSC) score of both the aggregated results as two subgroups (i.e., high-grade and low-grade gliomas). This study was registered in PROSPERO prior to initiation (CRD42020191033).
-
-                                         Results
-                                         After the literature search (n = 734), 42 studies were included in the systematic literature review. Ten studies were eligible for inclusion in the meta-analysis. Overall, the MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86). In addition, a DSC score of 0.83 (95% CI: 0.80-0.87) and 0.82 (95% CI: 0.78-0.87) was observed for the automated glioma segmentation of the high-grade and low-grade gliomas, respectively. However, heterogeneity was considerably high between included studies, and publication bias was observed.
-
-                                         Conclusion
-                                         MLAs facilitating automated segmentation of gliomas show good accuracy, which is promising for future implementation in neuroradiology. However, before actual implementation, a few hurdles are yet to be overcome. It is crucial that quality guidelines are followed when reporting on MLAs, which includes validation on an external test set.
-
-                                         Key Points
-                                         * MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86), indicating a good performance.
-                                         * MLA performance was comparable when comparing the segmentation results of the high-grade gliomas and the low-grade gliomas.
-                                         * For future studies using MLAs, it is crucial that quality guidelines are followed when reporting on MLAs, which includes validation on an external test set.},
+                                           Objectives
+                                           Different machine learning algorithms (MLAs) for automated segmentation of gliomas have been reported in the literature. Automated segmentation of different tumor characteristics can be of added value for the diagnostic work-up and treatment planning. The purpose of this study was to provide an overview and meta-analysis of different MLA methods.
+  
+                                           Methods
+                                           A systematic literature review and meta-analysis was performed on the eligible studies describing the segmentation of gliomas. Meta-analysis of the performance was conducted on the reported dice similarity coefficient (DSC) score of both the aggregated results as two subgroups (i.e., high-grade and low-grade gliomas). This study was registered in PROSPERO prior to initiation (CRD42020191033).
+  
+                                           Results
+                                           After the literature search (n = 734), 42 studies were included in the systematic literature review. Ten studies were eligible for inclusion in the meta-analysis. Overall, the MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86). In addition, a DSC score of 0.83 (95% CI: 0.80-0.87) and 0.82 (95% CI: 0.78-0.87) was observed for the automated glioma segmentation of the high-grade and low-grade gliomas, respectively. However, heterogeneity was considerably high between included studies, and publication bias was observed.
+  
+                                           Conclusion
+                                           MLAs facilitating automated segmentation of gliomas show good accuracy, which is promising for future implementation in neuroradiology. However, before actual implementation, a few hurdles are yet to be overcome. It is crucial that quality guidelines are followed when reporting on MLAs, which includes validation on an external test set.
+  
+                                           Key Points
+                                           * MLAs from the included studies showed an overall DSC score of 0.84 (95% CI: 0.82-0.86), indicating a good performance.
+                                           * MLA performance was comparable when comparing the segmentation results of the high-grade gliomas and the low-grade gliomas.
+                                           * For future studies using MLAs, it is crucial that quality guidelines are followed when reporting on MLAs, which includes validation on an external test set.},
   all_ss_ids = {[9b24ef9a54a77629fcd8ad2dab9b668917cd96e4]},
   automatic = {yes},
   citation-count = {25},
@@ -15784,22 +15811,22 @@ @article{Kers21
   url = {http://dx.doi.org/10.1007/s00330-021-08217-w},
   volume = {32},
   abstract = {Abstract
-                                         Objectives
-                                         To evaluate if artificial intelligence (AI) can discriminate recalled benign from recalled malignant mammographic screening abnormalities to improve screening performance.
-
-                                         Methods
-                                         A total of 2257 full-field digital mammography screening examinations, obtained 2011-2013, of women aged 50-69 years which were recalled for further assessment of 295 malignant out of 305 truly malignant lesions and 2289 benign lesions after independent double-reading with arbitration, were included in this retrospective study. A deep learning AI system was used to obtain a score (0-95) for each recalled lesion, representing the likelihood of breast cancer. The sensitivity on the lesion level and the proportion of women without false-positive ratings (non-FPR) resulting under AI were estimated as a function of the classification cutoff and compared to that of human readers.
-
-                                         Results
-                                         Using a cutoff of 1, AI decreased the proportion of women with false-positives from 89.9 to 62.0%, non-FPR 11.1% vs. 38.0% (difference 26.9%, 95% confidence interval 25.1-28.8%; p &lt; .001), preventing 30.1% of reader-induced false-positive recalls, while reducing sensitivity from 96.7 to 91.1% (5.6%, 3.1-8.0%) as compared to human reading. The positive predictive value of recall (PPV-1) increased from 12.8 to 16.5% (3.7%, 3.5-4.0%). In women with mass-related lesions (n = 900), the non-FPR was 14.2% for humans vs. 36.7% for AI (22.4%, 19.8-25.3%) at a sensitivity of 98.5% vs. 97.1% (1.5%, 0-3.5%).
-
-                                         Conclusion
-                                         The application of AI during consensus conference might especially help readers to reduce false-positive recalls of masses at the expense of a small sensitivity reduction. Prospective studies are needed to further evaluate the screening benefit of AI in practice.
-
-                                         Key Points
-                                         * Integrating the use of artificial intelligence in the arbitration process reduces benign recalls and increases the positive predictive value of recall at the expense of some sensitivity loss.
-                                         * Application of the artificial intelligence system to aid the decision to recall a woman seems particularly beneficial for masses, where the system reaches comparable sensitivity to that of the readers, but with considerably reduced false-positives.
-                                         * About one-fourth of all recalled malignant lesions are not automatically marked by the system such that their evaluation (AI score) must be retrieved manually by the reader. A thorough reading of screening mammograms by readers to identify suspicious lesions therefore remains mandatory.},
+                                           Objectives
+                                           To evaluate if artificial intelligence (AI) can discriminate recalled benign from recalled malignant mammographic screening abnormalities to improve screening performance.
+  
+                                           Methods
+                                           A total of 2257 full-field digital mammography screening examinations, obtained 2011-2013, of women aged 50-69 years which were recalled for further assessment of 295 malignant out of 305 truly malignant lesions and 2289 benign lesions after independent double-reading with arbitration, were included in this retrospective study. A deep learning AI system was used to obtain a score (0-95) for each recalled lesion, representing the likelihood of breast cancer. The sensitivity on the lesion level and the proportion of women without false-positive ratings (non-FPR) resulting under AI were estimated as a function of the classification cutoff and compared to that of human readers.
+  
+                                           Results
+                                           Using a cutoff of 1, AI decreased the proportion of women with false-positives from 89.9 to 62.0%, non-FPR 11.1% vs. 38.0% (difference 26.9%, 95% confidence interval 25.1-28.8%; p $<$ .001), preventing 30.1% of reader-induced false-positive recalls, while reducing sensitivity from 96.7 to 91.1% (5.6%, 3.1-8.0%) as compared to human reading. The positive predictive value of recall (PPV-1) increased from 12.8 to 16.5% (3.7%, 3.5-4.0%). In women with mass-related lesions (n = 900), the non-FPR was 14.2% for humans vs. 36.7% for AI (22.4%, 19.8-25.3%) at a sensitivity of 98.5% vs. 97.1% (1.5%, 0-3.5%).
+  
+                                           Conclusion
+                                           The application of AI during consensus conference might especially help readers to reduce false-positive recalls of masses at the expense of a small sensitivity reduction. Prospective studies are needed to further evaluate the screening benefit of AI in practice.
+  
+                                           Key Points
+                                           * Integrating the use of artificial intelligence in the arbitration process reduces benign recalls and increases the positive predictive value of recall at the expense of some sensitivity loss.
+                                           * Application of the artificial intelligence system to aid the decision to recall a woman seems particularly beneficial for masses, where the system reaches comparable sensitivity to that of the readers, but with considerably reduced false-positives.
+                                           * About one-fourth of all recalled malignant lesions are not automatically marked by the system such that their evaluation (AI score) must be retrieved manually by the reader. A thorough reading of screening mammograms by readers to identify suspicious lesions therefore remains mandatory.},
   all_ss_ids = {[7c6d27bbd6a0933a1f3ec40096a17b6a93ee3fa1]},
   automatic = {yes},
   citation-count = {7},
@@ -15867,8 +15894,8 @@ @article{Khor24
   doi = {10.1158/1557-3265.bladder24-b004},
   year = {2024},
   abstract = {Abstract
-                      Introduction: Transurethral resection followed by intravesical bacillus Calmette-Guerin (BCG) is the standard of care for high-risk non-muscle-invasive bladder cancer (HR-NMIBC) patients. However, 20% of HR-NMIBC patients progress to muscle-invasive or metastatic disease, which is associated with a high-risk of death. Early identification of patients who are unlikely to benefit from BCG treatment could benefit from an early radical cystectomy or bladder-sparing alternatives. Previously, we showed that molecular subtyping or HR-NMIBC identified BCG response subtypes (BRS 1/2/3). BRS3 tumors were associated with a poor response to BCG and a high-risk of progression. However, molecular subtyping relies on resource-intensive and costly sequencing methods such as next-generation sequencing, and microarray analysis. We employed deep learning (DL) to predict BRS3 vs BRS1/2 molecular subtypes from digitized histology slides. Methods: Haematoxylin and eosin (H&amp;E)-stained slides were digitized, and quality control was performed. The areas where the RNA was extracted were annotated pixel-wise by a trained pathology assistant, using QuPath. Image tiles of 512 $\times$ 512 pixels at 10X, 20X, and 40X magnifications were extracted with a 25% overlap. For classification, BRS1 and BRS2 images were grouped, while BRS3 images were categorized separately. After applying Macenko normalization, tiles were split based on individual patients into a 75/25 training/validation ratio via stratified 4-fold cross-validation. Four DL models (DenseNet, Inception, ShuffleNet, ResNet) were trained to classify tiles into BRS3 and BRS1/2, with tile labels based on the patient's subtype. Patient-level classification was performed using majority voting from tile predictions. Model efficacy was gauged using the area under the ROC curve (AUC). Results: Out of 245 H&amp;E slides, 45 were discarded after quality control. 
From 200 patients - with a 70:30 distribution between BRS1&amp;2 and BRS3 The DenseNet model at 10x magnification was the best-performing model, achieving the highest AUC compared to all other models and magnifications. On the image tile level, the best DenseNet achieved an average AUC of 0.65 with a standard deviation of 0.03 across the four folds of the validation set. At the patient-level prediction, the best DenseNet reached an average AUC of 0.80 and a standard deviation of 0.05 on the validation set. Conclusion: Our study demonstrates that deep learning classifies BSR3 from BSR1 and 2 subtypes from H&amp;E slides. This method holds potential to identify BRS3 HR-NMIBC patients who may benefit from alternative treatments than BCG. With further refinement, this deep learning strategy can become a key tool in histology, aiding in swift, affordable, and efficient subtype identification. Hence, this method has the potential to bridge the gap between molecular techniques and clinical practice. Future research should prioritize evaluating the model on an external test cohort and consider predicting additional molecular markers to fine-tune prediction outcomes.
-                      Citation Format: Farbod Khoraminia, Flouros C de Jong, Farhan Akram, Geert Litjens, Maarten D.J. Jansen, Alberto Nakuama Gonzalez, Danique Lichtenburg, Andrew Stubbs, Nadieh Khalili, Tahlita C.M. Zuiverloon. Deep learning unveils molecular footprints in histology: predicting molecular subtypes from bladder cancer histology slides [abstract]. In: Proceedings of the AACR Special Conference on Bladder Cancer: Transforming the Field; 2024 May 17-20; Charlotte, NC. Philadelphia (PA): AACR; Clin Cancer Res 2024;30(10_Suppl):Abstract nr B004.},
+                        Introduction: Transurethral resection followed by intravesical bacillus Calmette-Guerin (BCG) is the standard of care for high-risk non-muscle-invasive bladder cancer (HR-NMIBC) patients. However, 20% of HR-NMIBC patients progress to muscle-invasive or metastatic disease, which is associated with a high-risk of death. Early identification of patients who are unlikely to benefit from BCG treatment could benefit from an early radical cystectomy or bladder-sparing alternatives. Previously, we showed that molecular subtyping or HR-NMIBC identified BCG response subtypes (BRS 1/2/3). BRS3 tumors were associated with a poor response to BCG and a high-risk of progression. However, molecular subtyping relies on resource-intensive and costly sequencing methods such as next-generation sequencing, and microarray analysis. We employed deep learning (DL) to predict BRS3 vs BRS1/2 molecular subtypes from digitized histology slides. Methods: Haematoxylin and eosin (H&amp;E)-stained slides were digitized, and quality control was performed. The areas where the RNA was extracted were annotated pixel-wise by a trained pathology assistant, using QuPath. Image tiles of 512 $\times$ 512 pixels at 10X, 20X, and 40X magnifications were extracted with a 25% overlap. For classification, BRS1 and BRS2 images were grouped, while BRS3 images were categorized separately. After applying Macenko normalization, tiles were split based on individual patients into a 75/25 training/validation ratio via stratified 4-fold cross-validation. Four DL models (DenseNet, Inception, ShuffleNet, ResNet) were trained to classify tiles into BRS3 and BRS1/2, with tile labels based on the patient's subtype. Patient-level classification was performed using majority voting from tile predictions. Model efficacy was gauged using the area under the ROC curve (AUC). Results: Out of 245 H&amp;E slides, 45 were discarded after quality control. 
From 200 patients - with a 70:30 distribution between BRS1&amp;2 and BRS3 The DenseNet model at 10x magnification was the best-performing model, achieving the highest AUC compared to all other models and magnifications. On the image tile level, the best DenseNet achieved an average AUC of 0.65 with a standard deviation of 0.03 across the four folds of the validation set. At the patient-level prediction, the best DenseNet reached an average AUC of 0.80 and a standard deviation of 0.05 on the validation set. Conclusion: Our study demonstrates that deep learning classifies BRS3 from BRS1 and 2 subtypes from H&amp;E slides. This method holds potential to identify BRS3 HR-NMIBC patients who may benefit from alternative treatments than BCG. With further refinement, this deep learning strategy can become a key tool in histology, aiding in swift, affordable, and efficient subtype identification. Hence, this method has the potential to bridge the gap between molecular techniques and clinical practice. Future research should prioritize evaluating the model on an external test cohort and consider predicting additional molecular markers to fine-tune prediction outcomes.
+                        Citation Format: Farbod Khoraminia, Flouros C de Jong, Farhan Akram, Geert Litjens, Maarten D.J. Jansen, Alberto Nakuama Gonzalez, Danique Lichtenburg, Andrew Stubbs, Nadieh Khalili, Tahlita C.M. Zuiverloon. Deep learning unveils molecular footprints in histology: predicting molecular subtypes from bladder cancer histology slides [abstract]. In: Proceedings of the AACR Special Conference on Bladder Cancer: Transforming the Field; 2024 May 17-20; Charlotte, NC. Philadelphia (PA): AACR; Clin Cancer Res 2024;30(10_Suppl):Abstract nr B004.},
   url = {http://dx.doi.org/10.1158/1557-3265.bladder24-b004},
   file = {Khor24.pdf:pdf\\Khor24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -16113,7 +16140,7 @@ @phdthesis{Kock17a
   year = {2017},
   url = {https://dspace.library.uu.nl/handle/1874/343111},
   abstract = {This thesis describes and evaluates an interactive annotation system for chest CT scans. As a first step, the structure of interest is segmented automatically. This structure is then divided into smaller volumes of interest (VOIs) containing one type of texture. These VOIs
-                                                        are automatically labeled, either by a classifier or using a heuristic approach.},
+                                                          are automatically labeled, either by a classifier or using a heuristic approach.},
   file = {Kock17a.pdf:pdf/Kock17a.pdf:PDF},
   optnote = {DIAG},
   promotor = {M. A. Viergever, B. van Ginneken and W. M. Prokop},
@@ -16274,15 +16301,15 @@ @inproceedings{Kooi17b
   pages = {101341J},
   doi = {10.1117/12.2254586},
   abstract = {When humans identify objects in images, context is an important cue; a cheetah is more likely to be a domestic cat
-                                                       when a television set is recognised in the background. Similar principles apply to the analysis of medical images.
-                                                       The detection of diseases that manifest unilaterally in symmetrical organs or organ pairs can in part be facilitated
-                                                       by a search for symmetrical discrepancies in or between the organs in question. During a mammographic exam,
-                                                       images are recorded of each breast and absence of a certain structure around the same location in the contra-
-                                                       lateral image will render the area under scrutiny more suspicious and conversely, the presence of similar tissue
-                                                       less so. In this paper, we present a fusion scheme for a deep Convolutional Neural Network (CNN) architecture
-                                                       with the goal to optimally capture such asymmetries. The method is applied to the domain of mammography
-                                                       CAD, but can be relevant to other medical image analysis tasks where symmetry is important such as lung,
-                                                       prostate or brain images.},
+                                                         when a television set is recognised in the background. Similar principles apply to the analysis of medical images.
+                                                         The detection of diseases that manifest unilaterally in symmetrical organs or organ pairs can in part be facilitated
+                                                         by a search for symmetrical discrepancies in or between the organs in question. During a mammographic exam,
+                                                         images are recorded of each breast and absence of a certain structure around the same location in the contra-
+                                                         lateral image will render the area under scrutiny more suspicious and conversely, the presence of similar tissue
+                                                         less so. In this paper, we present a fusion scheme for a deep Convolutional Neural Network (CNN) architecture
+                                                         with the goal to optimally capture such asymmetries. The method is applied to the domain of mammography
+                                                         CAD, but can be relevant to other medical image analysis tasks where symmetry is important such as lung,
+                                                         prostate or brain images.},
   file = {Kooi17b.pdf:pdf\\Kooi17b.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   month = {3},
@@ -16323,8 +16350,8 @@ @article{Kooi17d
   pages = {International Society for Optics and Photonics},
   doi = {10.1117/1.JMI.4.4.044501},
   abstract = {Neural networks, in particular deep Convolutional Neural Networks (CNN), have recently gone through a renaissance sparked by the introduction of more efficient training procedures and massive amounts of raw annotated data. Barring a handful of modalities, medical images are typically too large to present as input as a whole and models are consequently trained with subsets of images or cases, representing the most crucial bits of information. When inspecting a scene to identify objects, humans take cues from not just the article in question but also the elements in its vicinity: a frisbee is more likely to be a plate in the presence of a fork and knife. Similar principles apply to the analysis of medical images: specialists base their judgment of an abnormality on all available data, harnessing information such as symmetrical differences in or between organs in question and temporal change, if multiple recordings are available. \\
-
-                                                        In this paper we investigate the addition of symmetry and temporal context information to a deep CNN with the purpose of detecting malignant soft tissue lesions in mammography. We employ a simple linear mapping that takes the location of a mass candidate and maps it to either the contra-lateral or prior mammogram and Regions Of Interest (ROI) are extracted around each location. We subsequently explore two different architectures (1) a fusion model employing two datastreams were both ROIs are fed to the network during training and testing and (2) a stage-wise approach where a single ROI CNN is trained on the primary image and subsequently used as feature extractor for both primary and symmetrical or prior ROIs. A 'shallow' Gradient Boosted Tree (GBT) classifier is then trained on the concatenation of these features and used to classify the joint representation. Results shown a significant increase in performance using the first architecture and symmetry information, but only marginal gains in performance using temporal data and the other setting. We feel results are promising and can greatly be improved when more temporal data becomes available.},
+  
+                                                          In this paper we investigate the addition of symmetry and temporal context information to a deep CNN with the purpose of detecting malignant soft tissue lesions in mammography. We employ a simple linear mapping that takes the location of a mass candidate and maps it to either the contra-lateral or prior mammogram and Regions Of Interest (ROI) are extracted around each location. We subsequently explore two different architectures (1) a fusion model employing two datastreams were both ROIs are fed to the network during training and testing and (2) a stage-wise approach where a single ROI CNN is trained on the primary image and subsequently used as feature extractor for both primary and symmetrical or prior ROIs. A 'shallow' Gradient Boosted Tree (GBT) classifier is then trained on the concatenation of these features and used to classify the joint representation. Results shown a significant increase in performance using the first architecture and symmetry information, but only marginal gains in performance using temporal data and the other setting. We feel results are promising and can greatly be improved when more temporal data becomes available.},
   file = {Kooi17d.pdf:pdf\\Kooi17d.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29021992},
@@ -16398,7 +16425,7 @@ @conference{Koop08a
   booktitle = ISMRM,
   year = {2008},
   abstract = {Introduction: MR venography based upon susceptibility weighted imaging (SWI) can depict the venous vascular tree with excellent detail by using minimum intensity projections (mIP) over several slices [1]. While this visualisation works quite nicely, this approach does not classify veins, which can be necessary, e.g. to remove veins from high resolution functional MRI scans. To solve this it has been proposed to use filters known from angiography [2]. In this study we assessed the performance of two such filters: the Utrecht vesselness filter and Vessel Enhancing Diffusion (VED) in regions of high and low SNR. As higher field strengths are beneficial for SWI venography the VED filter was used on SWI data acquired at 7T and compared with 3T data. Methods:All experiments were performed on whole body MRI scanners (Siemens, Erlangen, Germany) at field strengths of 3T and 7T. SWI images were acquired using a firstorder flow-compensated 3D gradient echo FLASH sequence with the following parameters: FA 15A-A?A 1/2 , BW 120 Hz/pixel, acceleration factor 2 (GRAPPA). At 3T we used an echo time of 28 ms with a TR of 35 ms, at 7T these were 15 ms and 22 ms respectively. Whole brain sagittal acquisitions were performed on the same subject at both fields using a matrix size of 352x448x144 and a resolution of 0.57x0.57x1.25 mm3. Acquisition times were 15 and 10 minutes at 3T and 7T respectively. To test the filters in regions of high and low SNR a dataset was acquired using an eight channel occipital surface coil array [3]. In order to increase SNR an average over 4 repetitions was used. Matrix size was 144x192x40 using an isotropic resolution of 0.75 mm. Acquisition time was 2 minutes per volume. All datasets were intensityinhomogeneity corrected and veins were segmented using the Utrecht vesselness filter and VED which are described in detail in [4] and [5] respectively. 
In short, the Utrecht filter uses second order image information to distinguish dark tube-like structures (veins) from their surroundings and noise. The VED filter uses an iterative approach by alternately applying the Utrecht filter and a diffusion process to eliminate noise. However the shape/direction of the diffusion is dependent on the vessel likeliness found by the previous iteration of the Utrecht filter. For voxels that do not belong to the venous tree, the diffusion is isotropic, for venous voxels diffusion is applied in the direction of the vessel. This diffusion process improves the starting conditions for the next Utrecht filter step. In this study 5 iterations were used. In addition to the automated filters, manual segmentation was performed on the 0.75 mm resolution dataset to be compared with the VED result. Results: Fig. 1a shows a mIP over 11 slices of the dataset acquired using the occipital coil. Fig. 1b shows the maximum intensity projection (MIP) of the output of the Utrecht filter, fig. 1c shows the MIP of the VED result. In the bottom part of the images, where SNR is relatively high, both filters perform very well. In the top part, where SNR is reduced due to the inhomogeneous coil profile, the VED method is better able to contrast veins with respect to the noisy background than the Utrecht filter. Fig. 2 shows the manual segmentation (a) and the VED result (b) overlaid on the mIP of the data. Apart from the very top part of the image (where SNR was extremely low) both results are in excellent agreement. Conclusion: Two automated vein segmentation filters have been applied to SWI-venography data. In regions of high SNR the Utrecht vesselness filter and the vessel enhancing diffusion method both performed very well. In regions of low SNR the VED method outperformed the Utrecht filter and the result was in excellent agreement with manual segmentation results. 
The VED filter was applied to whole brain 3T and 7T data where A-A?A 1/2  due to the huge amount of visible veins - manual segmentation can no longer be considered a possibility. Although the results at 3T were slightly better than at 7T due to increased SNR inhomogeneity at the latter
-                                                       field strength, 7T allows for a much shorter acquisition time and therefore shows good promise for future SWI-venography.},
+                                                         field strength, 7T allows for a much shorter acquisition time and therefore shows good promise for future SWI-venography.},
   file = {Koop08a.pdf:pdf\\Koop08a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -16503,9 +16530,9 @@ @conference{Kosc18
   booktitle = RSNA,
   year = {2018},
   abstract = {PURPOSE: Brain extraction methods in {MRI} have so far been exclusively developed for T1- and T2-weighted images. A deep neural network is presented to segment the brain tissue in susceptibility-weighted images ({SWI}) in healthy individuals and patients with traumatic brain injury ({TBI}).
-                                                       MATERIALS AND METHODS: In total, {MRI} scans from 33 patients with moderate to severe {TBI} and 18 healthy controls were collected. {SWI}s were acquired with 27ms {TR}, 20ms {TE}, 15deg flip angle, and0.98x0.98x1.00mm3 voxel size on a {3T} Siemens MRI scanner. A small scale 2D-U-Net was implemented (18 convolution layers, max. 256 features per layer) processing a volume in axial direction. The U-Net architecture allowed the model to utilize both local and contextual information. The output probability maps were thresholded and possible outliers were removed by taking the largest connected component. 20 {TBI} patients and 10 controls served as a test set, the remaining patients were used for training. The reference standard were brain masks obtained with {SPM}, a publicly available software package commonly used for brain extractions in {MR} neuroimaging, but not optimized for {SWI} sequences. These annotations were visually inspected. The results of the deep learning method were visually inspected for completeness and overall quality. Dice similarity coefficient ({DCS}) and the modified Hausdorff ({MHD}) distance were reported for the test set.
-                                                       RESULTS: The {DCS} was 0.98+-0.002 per volume at the chosen operating point on the SPM standard and the {MHD} was 0.93+-0.11mm per volume. It took less than 10 seconds to compute the complete 3D brain mask on a modern {GPU}. Overall, our method was capable of learning from a sub-optimal reference standard and extracting the brain in an {SWI} image. It mimicked some of the deficiencies of the {SPM} brain masks, such as occasional failures in the most inferior or superior axial slices, but also mitigated others through generalization over the training set. Holes in the mask caused by contusions or hematomas were less prevalent with the 2D-U-Net than with {SPM}.
-                                                       CONCLUSION: The 2D-U-Net method provides fast brain extractions in {MR-SWI}.},
+                                                         MATERIALS AND METHODS: In total, {MRI} scans from 33 patients with moderate to severe {TBI} and 18 healthy controls were collected. {SWI}s were acquired with 27ms {TR}, 20ms {TE}, 15deg flip angle, and0.98x0.98x1.00mm3 voxel size on a {3T} Siemens MRI scanner. A small scale 2D-U-Net was implemented (18 convolution layers, max. 256 features per layer) processing a volume in axial direction. The U-Net architecture allowed the model to utilize both local and contextual information. The output probability maps were thresholded and possible outliers were removed by taking the largest connected component. 20 {TBI} patients and 10 controls served as a test set, the remaining patients were used for training. The reference standard were brain masks obtained with {SPM}, a publicly available software package commonly used for brain extractions in {MR} neuroimaging, but not optimized for {SWI} sequences. These annotations were visually inspected. The results of the deep learning method were visually inspected for completeness and overall quality. Dice similarity coefficient ({DCS}) and the modified Hausdorff ({MHD}) distance were reported for the test set.
+                                                         RESULTS: The {DCS} was 0.98+-0.002 per volume at the chosen operating point on the SPM standard and the {MHD} was 0.93+-0.11mm per volume. It took less than 10 seconds to compute the complete 3D brain mask on a modern {GPU}. Overall, our method was capable of learning from a sub-optimal reference standard and extracting the brain in an {SWI} image. It mimicked some of the deficiencies of the {SPM} brain masks, such as occasional failures in the most inferior or superior axial slices, but also mitigated others through generalization over the training set. Holes in the mask caused by contusions or hematomas were less prevalent with the 2D-U-Net than with {SPM}.
+                                                         CONCLUSION: The 2D-U-Net method provides fast brain extractions in {MR-SWI}.},
   file = {Kosc18.pdf:pdf\\Kosc18.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -16912,22 +16939,6 @@ @inproceedings{Laue13
   gscites = {4},
 }
 
-@article{Laur22,
-  author = {Lauritzen, Andreas D. and von Euler-Chelpin, My Catarina and Lynge, Elsebeth and Vejborg, Ilse and Nielsen, Mads and Karssemeijer, Nico and Lillholm, Martin},
-  title = {Robust Cross-vendor Mammographic Texture Models Using Augmentation-based Domain Adaptation for Long-term Breast Cancer Risk},
-  doi = {10.48550/ARXIV.2212.13439},
-  year = {2022},
-  abstract = {Purpose: Risk-stratified breast cancer screening might improve early detection and efficiency without comprising quality. However, modern mammography-based risk models do not ensure adaptation across vendor-domains and rely on cancer precursors, associated with short-term risk, which might limit long-term risk assessment. We report a cross-vendor mammographic texture model for long-term risk. Approach: The texture model was robustly trained using two systematically designed case-control datasets. Textural features, indicative of future breast cancer, were learned by excluding samples with diagnosed/potential malignancies from training. An augmentation-based domain adaption technique, based on flavorization of mammographic views, ensured generalization across vendor-domains. The model was validated in 66,607 consecutively screened Danish women with flavorized Siemens views and 25,706 Dutch women with Hologic-processed views. Performances were evaluated for interval cancers (IC) within two years from screening and long-term cancers (LTC) from two years after screening. The texture model was combined with established risk factors to flag 10% of women with the highest risk. Results: In Danish women, the texture model achieved an area under the receiver operating characteristic (AUC) of 0.71 and 0.65 for ICs and LTCs, respectively. In Dutch women with Hologic-processed views, the AUCs were not different from AUCs in Danish women with flavorized views. The AUC for texture combined with established risk factors increased to 0.68 for LTCs. The 10% of women flagged as high-risk accounted for 25.5% of ICs and 24.8% of LTCs. Conclusions: The texture model robustly estimated long-term breast cancer risk while adapting to an unseen processed vendor-domain and identified a clinically relevant high-risk subgroup.},
-  url = {https://arxiv.org/abs/2212.13439},
-  file = {Laur22.pdf:pdf\\Laur22.pdf:PDF},
-  optnote = {DIAG, RADIOLOGY},
-  journal = {arXiv:2212.13439},
-  automatic = {yes},
-  all_ss_ids = {[98cec4020ccc1ef0399b4f866544a30fb550d34c]},
-  pmid = {37780685},
-  gscites = {0},
-}
-
 @article{Laur22a,
   author = {Lauritzen, Andreas D. and Rodr\'{i}guez-Ruiz, Alejandro and von Euler-Chelpin, My Catarina and Lynge, Elsebeth and Vejborg, Ilse and Nielsen, Mads and Karssemeijer, Nico and Lillholm, Martin},
   title = {An Artificial Intelligence-based Mammography Screening                     Protocol for Breast Cancer: Outcome and Radiologist Workload},
@@ -16947,6 +16958,23 @@ @article{Laur22a
   gscites = {46},
 }
 
+@article{Laur23,
+  author = {Lauritzen, Andreas D. and von Euler-Chelpin, My Catarina and Lynge, Elsebeth and Vejborg, Ilse and Nielsen, Mads and Karssemeijer, Nico and Lillholm, Martin},
+  title = {Robust cross-vendor mammographic texture models using augmentation-based domain adaptation for long-term breast cancer risk},
+  doi = {10.1117/1.jmi.10.5.054003},
+  year = {2023},
+  abstract = {Purpose: Risk-stratified breast cancer screening might improve early detection and efficiency without comprising quality. However, modern mammography-based risk models do not ensure adaptation across vendor-domains and rely on cancer precursors, associated with short-term risk, which might limit long-term risk assessment. We report a cross-vendor mammographic texture model for long-term risk. Approach: The texture model was robustly trained using two systematically designed case-control datasets. Textural features, indicative of future breast cancer, were learned by excluding samples with diagnosed/potential malignancies from training. An augmentation-based domain adaption technique, based on flavorization of mammographic views, ensured generalization across vendor-domains. The model was validated in 66,607 consecutively screened Danish women with flavorized Siemens views and 25,706 Dutch women with Hologic-processed views. Performances were evaluated for interval cancers (IC) within 2 years from screening and long-term cancers (LTC) from 2 years after screening. The texture model was combined with established risk factors to flag 10% of women with the highest risk. Results: In Danish women, the texture model achieved an area under the receiver operating characteristic curve (AUC) of 0.71 and 0.65 for ICs and LTCs, respectively. In Dutch women with Hologic-processed views, the AUCs were not different from AUCs in Danish women with flavorized views. The AUC for texture combined with established risk factors increased to 0.68 for LTCs. The 10% of women flagged as high-risk accounted for 25.5% of ICs and 24.8% of LTCs. Conclusions: The texture model robustly estimated long-term breast cancer risk while adapting to an unseen processed vendor-domain and identified a clinically relevant high-risk subgroup.},
+  url = {http://dx.doi.org/10.1117/1.jmi.10.5.054003},
+  file = {Laur23.pdf:pdf\\Laur23.pdf:PDF},
+  optnote = {DIAG, RADIOLOGY},
+  journal = {Journal of Medical Imaging},
+  automatic = {yes},
+  all_ss_ids = {[98cec4020ccc1ef0399b4f866544a30fb550d34c]},
+  citation-count = {1},
+  volume = {10},
+  pmid = {37780685},
+}
+
 @article{Leac12,
   author = {Leach, M. O. and Morgan, B. and Tofts, P. S. and Buckley, D. L. and Huang, W. and Horsfield, M. A. and Chenevert, T. L. and Collins, D. J. and Jackson, A. and Lomas, D. and Whitcher, B. and Clarke, L. and Plummer, R. and Judson, I. and Jones, R. and Alonzi, R. and Brunner, T. and Koh, D. M. and Murphy, P. and Waterton, J. C. and Parker, G. and Graves, M. J. and Scheenen, T W J. and Redpath, T. W. and Orton, M. and Karczmar, G. and Huisman, H. and Barentsz, J. and Padhani, A. and , on behalf of the Experimental Cancer Medicine Centres Imaging Network Steering Committee},
   title = {Imaging vascular function for early stage clinical trials using dynamic contrast-enhanced magnetic resonance imaging},
@@ -16988,10 +17016,10 @@ @conference{Leem17
   booktitle = ECR,
   year = {2017},
   abstract = {Purpose: Segmentation of cerebral white matter (WM), gray matter (GM) and cerebrospinal fluid (CSF) in head CT is important for subsequent quantitative analysis and automated detection of cerebral pathology. We introduce VCAST, a new volumetric annotation tool aimed at delineating soft tissue in non-contrast CT (NCCT) and CT perfusion (CTP).
-                                                       Methods and Materials: VCAST supports traditional 2D visualizations and annotations, and provides functionalities to facilitate 3D segmentations based on pre-calculated grids of volumetric clusters where the clusters are spatially coherently grouped based on HUs. Clicking a cluster in a 2D-plane allows for inclusion of the 3D-cluster in the output segmentation.
-                                                       Ten patients with suspicion of ischemic stroke were included in this retrospective study, five NCCTs and five whole brain CTPs (320-row detector scanner). Temporal average CTA was reconstructed from CTP and in one slice in arbitrary direction, WM, GM and CSF were annotated two times by one observer using VCAST. In NCCT, a subvolume of approximately 22 mm^3 was randomly selected in which CSF was annotated by one observer, using VCAST either with 2D (slice-based) or 2D and 3D (cluster-based) annotation support. Dice coefficients and annotation times were reported.
-                                                       Results: Dice coefficients were 0.86A+-0.04, 0.91A+-0.02, 0.87A+-0.02 for CSF, GM and WM respectively. CSF annotation times reduced from 16A+-9 to 8A+-5 minutes with 3D cluster support (p=0.02). CSF Dice similarity was 0.81A+-0.03.
-                                                       Conclusion: VCAST is a volumetric annotation tool which reduces the time to obtain 3D segmentations in head CT while maintaining good overlap with a slice-based approach.},
+                                                         Methods and Materials: VCAST supports traditional 2D visualizations and annotations, and provides functionalities to facilitate 3D segmentations based on pre-calculated grids of volumetric clusters where the clusters are spatially coherently grouped based on HUs. Clicking a cluster in a 2D-plane allows for inclusion of the 3D-cluster in the output segmentation.
+                                                         Ten patients with suspicion of ischemic stroke were included in this retrospective study, five NCCTs and five whole brain CTPs (320-row detector scanner). Temporal average CTA was reconstructed from CTP and in one slice in arbitrary direction, WM, GM and CSF were annotated two times by one observer using VCAST. In NCCT, a subvolume of approximately 22 mm^3 was randomly selected in which CSF was annotated by one observer, using VCAST either with 2D (slice-based) or 2D and 3D (cluster-based) annotation support. Dice coefficients and annotation times were reported.
+                                                         Results: Dice coefficients were 0.86+-0.04, 0.91+-0.02, 0.87+-0.02 for CSF, GM and WM respectively. CSF annotation times reduced from 16+-9 to 8+-5 minutes with 3D cluster support (p=0.02). CSF Dice similarity was 0.81+-0.03.
+                                                         Conclusion: VCAST is a volumetric annotation tool which reduces the time to obtain 3D segmentations in head CT while maintaining good overlap with a slice-based approach.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -17031,18 +17059,18 @@ @inproceedings{Leem18c
   year = {2018},
   url = {https://openreview.net/pdf?id=rJxlLGBElN},
   abstract = {There is a demand for deep learning approaches able to process high resolution 3D
-                                                       volumes in an accurate and fast way. However, training of these models is often
-                                                       limited by the available GPU memory, which often results in reduced model depth,
-                                                       receptive field, and input size, limiting the expressiveness of the model. In this work
-                                                       we present a memory efficient modified convolutional-LSTM, which integrates
-                                                       a context-rich 2D U-Net as an input in a slice based manner and subsequently
-                                                       integrates the acquired slices using LSTM to create the full 3D context. Memory
-                                                       savings achieved by checkpointing on one or more steps within the LSTM allow
-                                                       for direct training on a single full non-contrast CT volume of: 512 x 512 x 320 on
-                                                       a NVIDIA Titan X with 12 GB of VRAM. We demonstrate the effectiveness of our
-                                                       method by training and segmenting the cranial cavity including soft-brain tissue
-                                                       and CSF in the non-contrast CT end-to-end on the full image data, without any
-                                                       stitching, while preserving a large receptive field and high expressiveness.},
+                                                         volumes in an accurate and fast way. However, training of these models is often
+                                                         limited by the available GPU memory, which often results in reduced model depth,
+                                                         receptive field, and input size, limiting the expressiveness of the model. In this work
+                                                         we present a memory efficient modified convolutional-LSTM, which integrates
+                                                         a context-rich 2D U-Net as an input in a slice based manner and subsequently
+                                                         integrates the acquired slices using LSTM to create the full 3D context. Memory
+                                                         savings achieved by checkpointing on one or more steps within the LSTM allow
+                                                         for direct training on a single full non-contrast CT volume of: 512 x 512 x 320 on
+                                                         a NVIDIA Titan X with 12 GB of VRAM. We demonstrate the effectiveness of our
+                                                         method by training and segmenting the cranial cavity including soft-brain tissue
+                                                         and CSF in the non-contrast CT end-to-end on the full image data, without any
+                                                         stitching, while preserving a large receptive field and high expressiveness.},
   file = {:pdf/Leem18c.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   gsid = {2197973819111672877},
@@ -17080,8 +17108,8 @@ @article{Leem19b
   doi = {10.21105/joss.01576},
   code = {https://github.com/silvandeleemput/memcnn},
   abstract = {Neural networks are computational models that were originally inspired by biological neural networks like animal brains. These networks are composed of many small computational units called neurons that perform elementary calculations. Instead of explicitly programming the behavior of neural networks, these models can be trained to perform tasks, like classifying images, by presenting them examples. Sufficiently complex neural networks can automatically extract task-relevant characteristics from the presented examples without having prior knowledge about the task domain, which makes them attractive for many complicated real-world applications.
-
-                                                       Reversible operations have recently been successfully applied to classification problems to reduce memory requirements during neural network training. This feature is accomplished by removing the need to store the input activation for computing the gradients at the backward pass and instead reconstruct them on demand. However, current approaches rely on custom implementations of backpropagation, which limits applicability and extendibility. We present MemCNN, a novel PyTorch framework that simplifies the application of reversible functions by removing the need for a customized backpropagation. The framework contains a set of practical generalized tools, which can wrap common operations like convolutions and batch normalization and which take care of memory management. We validate the presented framework by reproducing state-of-the-art experiments using MemCNN and by comparing classification accuracy and training time on Cifar-10 and Cifar-100. Our MemCNN implementations achieved similar classification accuracy and faster training times while retaining compatibility with the default backpropagation facilities of PyTorch.},
+  
+                                                         Reversible operations have recently been successfully applied to classification problems to reduce memory requirements during neural network training. This feature is accomplished by removing the need to store the input activation for computing the gradients at the backward pass and instead reconstruct them on demand. However, current approaches rely on custom implementations of backpropagation, which limits applicability and extendibility. We present MemCNN, a novel PyTorch framework that simplifies the application of reversible functions by removing the need for a customized backpropagation. The framework contains a set of practical generalized tools, which can wrap common operations like convolutions and batch normalization and which take care of memory management. We validate the presented framework by reproducing state-of-the-art experiments using MemCNN and by comparing classification accuracy and training time on Cifar-10 and Cifar-100. Our MemCNN implementations achieved similar classification accuracy and faster training times while retaining compatibility with the default backpropagation facilities of PyTorch.},
   file = {:pdf/Leem19b.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   publisher = {The Open Journal},
@@ -17143,10 +17171,10 @@ @conference{Leeu20
   author = {van Leeuwen, Kicky G. and Schalekamp, Steven and Rutten, Matthieu J.C.M. and van Ginneken, Bram and de Rooij, Maarten},
   title = {Scientific Evidence for 100 Commercially Available Artificial Intelligence Tools for Radiology: A Systematic Review},
   abstract = {Purpose: To survey scientific evidence for all CE marked artificial intelligence (AI) based software products for radiology available as of April, 2020.
-                            Materials and Methods: We created an online overview of CE-certified AI software products for clinical radiology based on vendor-supplied product specifications  (www.aiforradiology.com). For these products, we conducted a systematic literature study on Pubmed for original, peer-reviewed, English articles published between Jan 1, 2015 and April 14, 2020 on the efficacy of the AI software. Papers were included when the product and/or company name were mentioned, when efficacy level 2 to 6 according to Fryback was reported on an independent dataset, and when the tool was applied on in vivo human data.
-                            Results: Our product overview consisted of 100 CE-certified software products from 51 different vendors. Among the 839 papers screened, 108 met the inclusion criteria. For 70/100 products we did not find papers that met the inclusion criteria. The evidence of the other 30 products was predominantly (84%) focused on diagnostic accuracy (efficacy level 2). Half of the available evidence (49%) was independent and not (co)-funded or (co)-authored by the vendor. In more than half (55%) of the papers the version number of the product used in the study was not mentioned. From all studies, 20 (18%) used validation data from multiple countries, 42 (39%) were multicenter studies, 25 (23%) were performed with acquisition machines from multiple manufacturers.
-                            Conclusion: One hundred CE-certified AI software products for radiology exist today. Yet, for the majority, scientific evidence on the clinical performance and clinical impact is lacking. These insights should raise awareness that certification may not guarantee technical and clinical efficacy of an AI product.
-                            Clinical relevance: Our findings identify the available evidence for commercially available AI software, aiming to contribute to a safe and effective implementation of AI software in radiology departments.},
+                              Materials and Methods: We created an online overview of CE-certified AI software products for clinical radiology based on vendor-supplied product specifications  (www.aiforradiology.com). For these products, we conducted a systematic literature study on Pubmed for original, peer-reviewed, English articles published between Jan 1, 2015 and April 14, 2020 on the efficacy of the AI software. Papers were included when the product and/or company name were mentioned, when efficacy level 2 to 6 according to Fryback was reported on an independent dataset, and when the tool was applied on in vivo human data.
+                              Results: Our product overview consisted of 100 CE-certified software products from 51 different vendors. Among the 839 papers screened, 108 met the inclusion criteria. For 70/100 products we did not find papers that met the inclusion criteria. The evidence of the other 30 products was predominantly (84%) focused on diagnostic accuracy (efficacy level 2). Half of the available evidence (49%) was independent and not (co)-funded or (co)-authored by the vendor. In more than half (55%) of the papers the version number of the product used in the study was not mentioned. From all studies, 20 (18%) used validation data from multiple countries, 42 (39%) were multicenter studies, 25 (23%) were performed with acquisition machines from multiple manufacturers.
+                              Conclusion: One hundred CE-certified AI software products for radiology exist today. Yet, for the majority, scientific evidence on the clinical performance and clinical impact is lacking. These insights should raise awareness that certification may not guarantee technical and clinical efficacy of an AI product.
+                              Clinical relevance: Our findings identify the available evidence for commercially available AI software, aiming to contribute to a safe and effective implementation of AI software in radiology departments.},
   booktitle = RSNA,
   year = {2020},
   optnote = {DIAG, RADIOLOGY},
@@ -17212,10 +17240,10 @@ @conference{Leeu21c
   booktitle = ECR,
   year = {2021},
   abstract = {Purpose: There are over 150 artificial intelligence (AI) products for radiology offered, but little is known about their current clinical use. We investigated actual clinical use of AI software in radiology departments in the Netherlands.
-                          Materials and Methods: We consulted the radiology department of each hospital organization in the Netherlands (n=70) about their current AI implementations and plans from February-March 2020. A representative of the department was asked to fill in a questionnaire about their knowledge, experience, research and/or clinical use of commercially available CE-certified AI products for radiology (n=93). Responses for these familiarity-levels were analysed to create an overview with quantitative metrics.
-                          Results: The response rate of the consulted hospitals was 43/70: 38/62 for general hospitals, 5/7 for academic medical centers, and 0/1 for children's hospitals. Of the respondents 30 (70%) were radiologists, 5 (12%) application or information managers, and 8 (19%), among others, clinical physicists and managers. A third (14) of the participating organizations had one to three AI applications in clinical use, with a total of 19 implementations. These implementations involved eight different vendors of which four were from the Netherlands. Most commonly used was software for bone age prediction and stroke detection. Respondents were most familiar with products aimed at neurology and cardiology. MR, CT and mammography were the most familiar modalities for AI. Most interest for clinical implementation was shown in software to triage exams. Eleven organizations (26%) had a dedicated budget for AI, either from the hospital or the department.
-                          Conclusion: Even though the supply of AI software is extensive, clinical use remains limited showing that we are still in the initial stages of integrating AI in clinical practice in the Netherlands.
-                          Limitations: Results may be influenced by a nonresponse bias.},
+                            Materials and Methods: We consulted the radiology department of each hospital organization in the Netherlands (n=70) about their current AI implementations and plans from February-March 2020. A representative of the department was asked to fill in a questionnaire about their knowledge, experience, research and/or clinical use of commercially available CE-certified AI products for radiology (n=93). Responses for these familiarity-levels were analysed to create an overview with quantitative metrics.
+                            Results: The response rate of the consulted hospitals was 43/70: 38/62 for general hospitals, 5/7 for academic medical centers, and 0/1 for children's hospitals. Of the respondents 30 (70%) were radiologists, 5 (12%) application or information managers, and 8 (19%), among others, clinical physicists and managers. A third (14) of the participating organizations had one to three AI applications in clinical use, with a total of 19 implementations. These implementations involved eight different vendors of which four were from the Netherlands. Most commonly used was software for bone age prediction and stroke detection. Respondents were most familiar with products aimed at neurology and cardiology. MR, CT and mammography were the most familiar modalities for AI. Most interest for clinical implementation was shown in software to triage exams. Eleven organizations (26%) had a dedicated budget for AI, either from the hospital or the department.
+                            Conclusion: Even though the supply of AI software is extensive, clinical use remains limited showing that we are still in the initial stages of integrating AI in clinical practice in the Netherlands.
+                            Limitations: Results may be influenced by a nonresponse bias.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -17223,10 +17251,10 @@ @conference{Leeu21d
   author = {van Leeuwen, Kicky G and Meijer, Frederick J A and Schalekamp, Steven and Rutten, Matthieu J C M and van Dijk, Ewoud J and van Ginneken, Bram and Govers, Tim M and de Rooij, Maarten},
   title = {Artificial Intelligence in Acute Stroke: an Early Health Technology Assessment of Vessel Occlusion Detection on Computed Tomography},
   abstract = {Purpose or Learning Objective: To demonstrate the cost-effectiveness of artificial intelligence (AI) software to aid in the detection of intracranial vessel occlusions in stroke compared to standard care by performing early health technology assessment.
-                          Methods or Background: We used a Markov based model from a societal perspective in a UK setting to demonstrate the potential value of an AI tool reported in expected incremental costs (IC) and effects (IE) in quality adjusted life years (QALYs). Initial population existed of patients suspected of stroke based on symptoms and exclusion of other causes as demonstrated by non-contrast cerebrum CT. Input parameters for the model were predominantly based on stroke registry data from the UK and complemented with pooled outcome data from large randomized trials. Parameters were varied to demonstrate model robustness.
-                          Results or Findings: The AI strategy with its base-case parameters (6% missed diagnoses of intra-arterial therapy eligible patients by clinicians, $40 per AI analysis, 50% reduction of missed vessel occlusions by AI) resulted in modest cost-savings and incremental QALYs over the projected lifetime (IC: - $156, -0.23%; IE: +0.01 QALYs, +0.07%) per ischaemic stroke patient. Within a ninety-day window after treatment no financial (IC: +$60) and negligible QALY (IE: +0.0001) gain was observed. For each yearly cohort of patients in the UK this translates to a total cost saving of $11 million.
-                          Conclusion: We showed that computer aided thrombus detection in emergency care has the potential to increase health and save costs. Results may contribute to the debate on the investments, financial accountability and reimbursement for the clinical use of AI technology.
-                          Limitations: Parameter values of the model were based on results from previous studies.},
+                            Methods or Background: We used a Markov based model from a societal perspective in a UK setting to demonstrate the potential value of an AI tool reported in expected incremental costs (IC) and effects (IE) in quality adjusted life years (QALYs). Initial population existed of patients suspected of stroke based on symptoms and exclusion of other causes as demonstrated by non-contrast cerebrum CT. Input parameters for the model were predominantly based on stroke registry data from the UK and complemented with pooled outcome data from large randomized trials. Parameters were varied to demonstrate model robustness.
+                            Results or Findings: The AI strategy with its base-case parameters (6% missed diagnoses of intra-arterial therapy eligible patients by clinicians, $40 per AI analysis, 50% reduction of missed vessel occlusions by AI) resulted in modest cost-savings and incremental QALYs over the projected lifetime (IC: - $156, -0.23%; IE: +0.01 QALYs, +0.07%) per ischaemic stroke patient. Within a ninety-day window after treatment no financial (IC: +$60) and negligible QALY (IE: +0.0001) gain was observed. For each yearly cohort of patients in the UK this translates to a total cost saving of $11 million.
+                            Conclusion: We showed that computer aided thrombus detection in emergency care has the potential to increase health and save costs. Results may contribute to the debate on the investments, financial accountability and reimbursement for the clinical use of AI technology.
+                            Limitations: Parameter values of the model were based on results from previous studies.},
   booktitle = ECR,
   year = {2021},
   optnote = {DIAG, RADIOLOGY},
@@ -17236,10 +17264,10 @@ @conference{Leeu21e
   author = {van Leeuwen, Kicky G. and de Rooij, Maarten and Rutten, Matthieu J.C.M. and van Ginneken, Bram and Schalekamp, Steven},
   title = {Performance Of A Commercial Software Package For Lung Nodule Detection On Chest Radiographs Compared With 8 Expert Readers},
   abstract = {Purpose: Multi-center evaluation of the stand-alone performance of commercially available lung nodule detectionsoftware (Lunit INSIGHT CXR3).
-                          Methods and Materials: A set of 300 posteroanterior (PA) and lateral chest radiographs from four medical centers in theNetherlands was collected. Solitary lung nodules ranging from 5 to 35 mm in size were present in 111 of the cases. Allnodules were confirmed by CT within three months of the radiograph acquisition. Control radiographs were determinedbased on a negative CT within six months. Five radiologists and three radiology residents scored the set to provide contextto the algorithm performance. All PA radiographs were processed by Lunit INSIGHT CXR3, a commercial software productthat detects ten common abnormalities in chest radiographs. Area under the receiver operating characteristics curve (AUC)and sensitivity at 90% specificity were used to measure performance. Multi-reader multi-case ROC analysis based on U-statistics (iMRMC-v4 software) was applied to compare CXR3 with the readers. Subanalysis was performed regardingnodule size (small<15mm, large>15mm) and conspicuity levels (well visible, moderately visible, subtle, very subtle).
-                          Results: Out of the 300 radiographs, 7 could not be processed by CXR3, resulting in a set of 104 nodule cases and 189normal cases for evaluation. The CXR3 AUC was 0.93 and significantly higher than the mean reader AUC of 0.82 (p<0.001).CXR3 was also significantly better than the best reader with an AUC of 0.88 (p=0.028). At a specificity level of 90%,sensitivity was 83.2% for CXR3 and 63.3% (std+-7.5%) for the reader average. Regarding conspicuity of the nodules, CXR3AUCs were 0.99 for well visible, 0.94 for moderately visible, 0.94 for subtle, and 0.78 for very subtle nodules. No significantdifference in CXR3 performance was observed between the detection of small (AUC 0.91) and large nodules (AUC 0.93).
-                          Conclusions: Lunit INSIGHT CXR3 significantly outperforms the comparison group of eight readers in nodule detection onchest radiographs.
-                          Clinical Relevance/Application: Generalizability of artificial intelligence algorithms is not trivial. Performance studiesincrease confidence in algorithms to the users, especially to those with similar patient populations.},
+                            Methods and Materials: A set of 300 posteroanterior (PA) and lateral chest radiographs from four medical centers in the Netherlands was collected. Solitary lung nodules ranging from 5 to 35 mm in size were present in 111 of the cases. All nodules were confirmed by CT within three months of the radiograph acquisition. Control radiographs were determined based on a negative CT within six months. Five radiologists and three radiology residents scored the set to provide context to the algorithm performance. All PA radiographs were processed by Lunit INSIGHT CXR3, a commercial software product that detects ten common abnormalities in chest radiographs. Area under the receiver operating characteristics curve (AUC) and sensitivity at 90% specificity were used to measure performance. Multi-reader multi-case ROC analysis based on U-statistics (iMRMC-v4 software) was applied to compare CXR3 with the readers. Subanalysis was performed regarding nodule size (small<15mm, large>15mm) and conspicuity levels (well visible, moderately visible, subtle, very subtle).
+                            Results: Out of the 300 radiographs, 7 could not be processed by CXR3, resulting in a set of 104 nodule cases and 189 normal cases for evaluation. The CXR3 AUC was 0.93 and significantly higher than the mean reader AUC of 0.82 (p<0.001). CXR3 was also significantly better than the best reader with an AUC of 0.88 (p=0.028). At a specificity level of 90%, sensitivity was 83.2% for CXR3 and 63.3% (std+-7.5%) for the reader average. Regarding conspicuity of the nodules, CXR3 AUCs were 0.99 for well visible, 0.94 for moderately visible, 0.94 for subtle, and 0.78 for very subtle nodules. No significant difference in CXR3 performance was observed between the detection of small (AUC 0.91) and large nodules (AUC 0.93).
+                            Conclusions: Lunit INSIGHT CXR3 significantly outperforms the comparison group of eight readers in nodule detection on chest radiographs.
+                            Clinical Relevance/Application: Generalizability of artificial intelligence algorithms is not trivial. Performance studies increase confidence in algorithms to the users, especially to those with similar patient populations.},
   booktitle = RSNA,
   year = {2021},
   optnote = {DIAG, RADIOLOGY},
@@ -17249,10 +17277,10 @@ @conference{Leeu21f
   author = {van Leeuwen, Kicky G. and de Rooij, Maarten and Rutten, Matthieu J.C.M. and Schalekamp, Steven and van Ginneken, Bram},
   title = {Commercial Artificial Intelligence Solutions For Radiology: A Market Update},
   abstract = {Purpose: Provide an overview of the current market of regulatory-cleared artificial intelligence (AI) software for radiology.
-                          Methods and Materials: An overview of CE marked AI products for clinical radiology is maintained online (https://www.AIforRadiology.com). Vendors were asked to verify and complete the product information. This overviewallows for analysis of market trends. Characteristics of the market were based on the state of the database on the 1st ofMay 2021.
-                          Results: In May 2021 there were 161 CE marked AI products on the market, an increase of 36% compared with one yearprior. The growth from 2019 to 2020 was 69% (from 70 to 118 products). The number of vendors offering AI products onlygrew with 13% from 61 in 2020 to 69 in 2021. The average number of products per company therefore increased from 1.9to 2.3. The time from company founding to the first product on the market is on average 4 years and 1 month. Mostprevalent are tools for neuro and chest imaging. With respect to modality, CT and MR covered 62% of all products. Half ofthe CE marked AI products (51%) have also been cleared by the FDA. To our knowledge, only four products were CEmarked under the new Medical Device Regulations. Subscription or licensing are the most popular pricing models. Themajority of products are offered with both the option of local and cloud-based installation.
-                          Conclusions: The growth of AI products new to the market is slowing down. This effect is even stronger for vendors.Existing vendors have been expanding their portfolios.
-                          Clinical Relevance/Application: The market of AI products for radiology is growing. Our research provides a transparentoverview of the available products and their evidence.},
+                            Methods and Materials: An overview of CE marked AI products for clinical radiology is maintained online (https://www.AIforRadiology.com). Vendors were asked to verify and complete the product information. This overview allows for analysis of market trends. Characteristics of the market were based on the state of the database on the 1st of May 2021.
+                            Results: In May 2021 there were 161 CE marked AI products on the market, an increase of 36% compared with one year prior. The growth from 2019 to 2020 was 69% (from 70 to 118 products). The number of vendors offering AI products only grew with 13% from 61 in 2020 to 69 in 2021. The average number of products per company therefore increased from 1.9 to 2.3. The time from company founding to the first product on the market is on average 4 years and 1 month. Most prevalent are tools for neuro and chest imaging. With respect to modality, CT and MR covered 62% of all products. Half of the CE marked AI products (51%) have also been cleared by the FDA. To our knowledge, only four products were CE marked under the new Medical Device Regulations. Subscription or licensing are the most popular pricing models. The majority of products are offered with both the option of local and cloud-based installation.
+                            Conclusions: The growth of AI products new to the market is slowing down. This effect is even stronger for vendors. Existing vendors have been expanding their portfolios.
+                            Clinical Relevance/Application: The market of AI products for radiology is growing. Our research provides a transparent overview of the available products and their evidence.},
   booktitle = RSNA,
   year = {2021},
   optnote = {DIAG, RADIOLOGY},
@@ -17262,10 +17290,10 @@ @conference{Leeu22
   author = {van Leeuwen, Kicky G. and de Rooij, Maarten and Schalekamp, Steven and van Ginneken, Bram and Rutten, Matthieu J.C.M.},
   title = {The rise of artificial intelligence solutions in radiology departments in the Netherlands},
   abstract = {Purpose: There are over 180 CE-marked artificial intelligence (AI) products for radiology commercially available in Europe, but little is known about the current clinical use. We investigated the clinical use of commercial AI software in radiology departments in the Netherlands over a two-year period.
-                          Methods: We consulted the radiology department of all hospital organizations in the Netherlands (n=69) in February-March 2020 (44 respondents) and February-March 2021 (37 respondents). A representative of the department was asked to fill in a questionnaire about the (planned) clinical use of CE marked AI products for radiology, the available funding for AI, and biggest obstacles for implementation.
-                          Results: From 2020 to 2021 the percentage of respondents that desired the adoption of AI tools in radiology increased from 63% to 86%. In 2020, 14 responding organisations used AI in clinical practice, which increased to 23 (33% of all organizations) in 2021. The total number of AI implementations in clinical practice expanded by 157%, from 19 to 49 implementations. Also, the diversity increased from 8 to 32 unique products. In 2021, 35% of respondents had budgets allocated for AI implementations either on the departmental level or on the institutional level, which was 26% in 2020. The major obstacles for AI adoption shifted from difficulties with the technical integration (2020) to the lack of budgets and an unclear business case (2021). Technical integration remained the second most often listed obstacle.
-                          Conclusion: AI adoption is gradually increasing in clinical radiology in the Netherlands. The number of radiology departments using AI has increased to at least a third of all organizations. Also, the number and diversity of AI applications per department grew substantially.
-                          Limitations: Results may be influenced by a nonresponse bias.},
+                            Methods: We consulted the radiology department of all hospital organizations in the Netherlands (n=69) in February-March 2020 (44 respondents) and February-March 2021 (37 respondents). A representative of the department was asked to fill in a questionnaire about the (planned) clinical use of CE marked AI products for radiology, the available funding for AI, and biggest obstacles for implementation.
+                            Results: From 2020 to 2021 the percentage of respondents that desired the adoption of AI tools in radiology increased from 63% to 86%. In 2020, 14 responding organisations used AI in clinical practice, which increased to 23 (33% of all organizations) in 2021. The total number of AI implementations in clinical practice expanded by 157%, from 19 to 49 implementations. Also, the diversity increased from 8 to 32 unique products. In 2021, 35% of respondents had budgets allocated for AI implementations either on the departmental level or on the institutional level, which was 26% in 2020. The major obstacles for AI adoption shifted from difficulties with the technical integration (2020) to the lack of budgets and an unclear business case (2021). Technical integration remained the second most often listed obstacle.
+                            Conclusion: AI adoption is gradually increasing in clinical radiology in the Netherlands. The number of radiology departments using AI has increased to at least a third of all organizations. Also, the number and diversity of AI applications per department grew substantially.
+                            Limitations: Results may be influenced by a nonresponse bias.},
   booktitle = ECR,
   year = {2022},
   optnote = {DIAG, RADIOLOGY},
@@ -17275,9 +17303,9 @@ @conference{Leeu22a
   author = {Deden, Laura N. and van Leeuwen, Kicky G. and Becks, M.J. and Bernsen, M.L.E. and de Rooij, Maarten and Martens, J.M. and Meijer, F.J.A.},
   title = {Gluren bij de buren - Evaluating and sharing real-world experience of an AI stroke tool in two centres},
   abstract = {Background: Currently, many hospitals are implementing AI software. However, clear clinical implementation procedures are not yet available. In order to exchange experiences, two interventional stroke centres (Radboudumc and Rijnstate) collaborated in the prospective evaluation of an AI tool for stroke diagnostics.
-                          Methodology: Primary aim of StrokeViewer (Nicolab) implementation in both centres was diagnostic support in detecting large vessel occlusions (LVO) in anterior cerebral circulation. Additionally, in Rijnstate analysis of cerebral CT perfusion (CTP) was available. In Radboudumc, LVO results were available after log in to the StrokeViewer server. In Rijnstate, results were pushed to PACS as a pdf-report. Trial period in Radboudumc was 12 months, in Rijnstate 7 months. The performance of proximal LVO detection was compared with radiologists' assessments. Users filled in a questionnaire on user experience at several time points. In Radboudumc, the use was monitored by individual log-in information.
-                          Results: Quantitative evaluation of ICA, M1 and proximal M2 occlusion detection (prevalence 18%) resulted in a case base sensitivity and specificity of 74% and 91% in Rijnstate (n=276) and 77% and 91% in Radboudumc (n=516). The use of the tool decreased over time. Radiologists unfamiliar with LVO assessment tended to value the AI report more than experienced radiologists. The net promoter scores were -56% in Radboudumc and -65% in Rijnstate. The tool was considered user friendly (7.2/10). CTP assessment in Rijnstate was used more frequently than LVO detection.
-                          Conclusion: This evaluation aids to understand some of the challenges involved in clinical implementation and acceptance by users of AI tools. Findings are consistent for both centres. Success is not only dependent on the product and its performance, but also on clinical goal setting, expectations, context and implementation choices. Sharing experience within the NVvR AInetwork can help to gain insights into crucial factors for success ("Gluren-bij-de-buren").},
+                            Methodology: Primary aim of StrokeViewer (Nicolab) implementation in both centres was diagnostic support in detecting large vessel occlusions (LVO) in anterior cerebral circulation. Additionally, in Rijnstate analysis of cerebral CT perfusion (CTP) was available. In Radboudumc, LVO results were available after log in to the StrokeViewer server. In Rijnstate, results were pushed to PACS as a pdf-report. Trial period in Radboudumc was 12 months, in Rijnstate 7 months. The performance of proximal LVO detection was compared with radiologists' assessments. Users filled in a questionnaire on user experience at several time points. In Radboudumc, the use was monitored by individual log-in information.
+                            Results: Quantitative evaluation of ICA, M1 and proximal M2 occlusion detection (prevalence 18%) resulted in a case base sensitivity and specificity of 74% and 91% in Rijnstate (n=276) and 77% and 91% in Radboudumc (n=516). The use of the tool decreased over time. Radiologists unfamiliar with LVO assessment tended to value the AI report more than experienced radiologists. The net promoter scores were -56% in Radboudumc and -65% in Rijnstate. The tool was considered user friendly (7.2/10). CTP assessment in Rijnstate was used more frequently than LVO detection.
+                            Conclusion: This evaluation aids to understand some of the challenges involved in clinical implementation and acceptance by users of AI tools. Findings are consistent for both centres. Success is not only dependent on the product and its performance, but also on clinical goal setting, expectations, context and implementation choices. Sharing experience within the NVvR AInetwork can help to gain insights into crucial factors for success ("Gluren-bij-de-buren").},
   booktitle = {Radiologendagen},
   year = {2022},
   optnote = {DIAG, RADIOLOGY},
@@ -17288,10 +17316,10 @@ @conference{Leeu22b
   booktitle = ECR,
   title = {Real-world evaluation of artificial intelligence software for cerebral large vessel occlusion detection in {CT} angiography},
   abstract = {Purpose: The commercially available AI tool (StrokeViewer v2, Nicolab) supports the diagnostic process of stroke by detecting large vessel occlusions (LVO) on CTA. We prospectively evaluated this tool in our department to monitor safety and impact.
-                          Methods: We implemented the software with the goal to improve the diagnosis of LVO and elevate the diagnostic confidence of the radiologist (resident). We used quantitative measures (data from clinical systems, vendor log files) and qualitative measures (user survey) to analyse diagnostic performance, number of users, login attempts, radiologists' diagnostic confidence, and user experience.
-                          Results: In total, 226 CTAs with a clinical indication of stroke between January-June 2021 were prospectively evaluated. Thirteen cases of posterior circulation and distal vessel occlusions were excluded as they were outside the intended use of the AI tool. The AI tool missed 12 of the 36 occlusions in the middle cerebral or intracranial internal carotid artery (M1=1, M2=10, ICA=1) resulting in an accuracy of 86.4%. Irrespective of location, the sensitivity was 77.8% and specificity 90.4%. The number of monthly unique users varied between 8 and 24 radiologists/residents. Log in attempts dropped after the initial month (which included training) to a monthly average of 44 attempts. The diagnostic confidence did not increase during the use of the tool. The likelihood that users would recommend StrokeViewer to colleagues was rated 4.5/10.
-                          Conclusion: Over six months, the use of StrokeViewer dropped and users did not sense improvement of diagnostic confidence. Measures have been taken to stimulate adoption for the latter six months of the trial period.
-                          Limitation: Because of the prospective character, no comparison could be made between radiologists supported by AI vs radiologists without AI.},
+                            Methods: We implemented the software with the goal to improve the diagnosis of LVO and elevate the diagnostic confidence of the radiologist (resident). We used quantitative measures (data from clinical systems, vendor log files) and qualitative measures (user survey) to analyse diagnostic performance, number of users, login attempts, radiologists' diagnostic confidence, and user experience.
+                            Results: In total, 226 CTAs with a clinical indication of stroke between January-June 2021 were prospectively evaluated. Thirteen cases of posterior circulation and distal vessel occlusions were excluded as they were outside the intended use of the AI tool. The AI tool missed 12 of the 36 occlusions in the middle cerebral or intracranial internal carotid artery (M1=1, M2=10, ICA=1) resulting in an accuracy of 86.4%. Irrespective of location, the sensitivity was 77.8% and specificity 90.4%. The number of monthly unique users varied between 8 and 24 radiologists/residents. Log in attempts dropped after the initial month (which included training) to a monthly average of 44 attempts. The diagnostic confidence did not increase during the use of the tool. The likelihood that users would recommend StrokeViewer to colleagues was rated 4.5/10.
+                            Conclusion: Over six months, the use of StrokeViewer dropped and users did not sense improvement of diagnostic confidence. Measures have been taken to stimulate adoption for the latter six months of the trial period.
+                            Limitation: Because of the prospective character, no comparison could be made between radiologists supported by AI vs radiologists without AI.},
   optnote = {DIAG, RADIOLOGY},
   year = {2022},
 }
@@ -17301,10 +17329,10 @@ @conference{Leeu23
   booktitle = ECR,
   title = {Potential risk of off-label use of commercially available AI-based software for radiology},
   abstract = {Purpose or Learning Objective: The aim of this study was to analyse potential discrepancies between the claims and disclaimers of the intended purpose statements of CE-marked AI-based software for radiology.
-                          Methods or Background: In March 2022, we asked all vendors listed on www.AIforRadiology.com (n=87) to verify or submit the intended purpose according to European clearance for their products (n=191). Any new additions were included until September 26th 2022 (n=12)). Claims and disclaimers were extracted from the statements. Potential conflicts of claims and disclaimers were flagged.
-                          Results or Findings: We received the intended purpose statements for 157 of the 203 products. Of those, 36 were excluded as they provided too little information to analyse. The included products were certified under the current medical device regulations (class IIa = 24, class IIb = 9) and former Medical Device Directive (class I = 45, class IIa = 39, class IIb = 3). Of the 121 included statements 56 held disclaimers. For 13 of these products the claims and disclaimers were flagged to contradict each other. Potential discrepant disclaimer statements were e.g. 'act per the standard of care' (n=7) and 'not for diagnostic use' (n=6), while claiming to aid in the diagnosis, triaging or risk scoring of clinical conditions.
-                          Conclusion: Potential discrepancies in claims and disclaimers were found for a substantial number of AI-tools bearing the risk that users of the AI software misunderstand the permitted use-cases which may lead to off-label use.
-                          Limitations: Not all intended purpose statements received were of sufficient quality to use for analysis. The definition of what information the intended purpose should contain is not clearly specified under the MDR making it hard to objectively assess or compare.},
+                            Methods or Background: In March 2022, we asked all vendors listed on www.AIforRadiology.com (n=87) to verify or submit the intended purpose according to European clearance for their products (n=191). Any new additions were included until September 26th 2022 (n=12). Claims and disclaimers were extracted from the statements. Potential conflicts of claims and disclaimers were flagged.
+                            Results or Findings: We received the intended purpose statements for 157 of the 203 products. Of those, 36 were excluded as they provided too little information to analyse. The included products were certified under the current medical device regulations (class IIa = 24, class IIb = 9) and former Medical Device Directive (class I = 45, class IIa = 39, class IIb = 3). Of the 121 included statements 56 held disclaimers. For 13 of these products the claims and disclaimers were flagged to contradict each other. Potential discrepant disclaimer statements were e.g. 'act per the standard of care' (n=7) and 'not for diagnostic use' (n=6), while claiming to aid in the diagnosis, triaging or risk scoring of clinical conditions.
+                            Conclusion: Potential discrepancies in claims and disclaimers were found for a substantial number of AI-tools bearing the risk that users of the AI software misunderstand the permitted use-cases which may lead to off-label use.
+                            Limitations: Not all intended purpose statements received were of sufficient quality to use for analysis. The definition of what information the intended purpose should contain is not clearly specified under the MDR making it hard to objectively assess or compare.},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
 }
@@ -17315,25 +17343,25 @@ @article{Leeu23a
   doi = {10.1007/s00330-023-09991-5},
   url = {http://dx.doi.org/10.1007/s00330-023-09991-5},
   abstract = {Abstract
-                                        Objectives
-                                        To map the clinical use of CE-marked artificial intelligence (AI)-based software in radiology departments in the Netherlands (n = 69) between 2020 and 2022.
-
-                                        Materials and methods
-                                        Our AI network (one radiologist or AI representative per Dutch hospital organization) received a questionnaire each spring from 2020 to 2022 about AI product usage, financing, and obstacles to adoption. Products that were not listed on <jats:ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://www.aiforradiology.com">www.AIforRadiology.com</jats:ext-link> by July 2022 were excluded from the analysis.
-
-                                        Results
-                                        The number of respondents was 43 in 2020, 36 in 2021, and 33 in 2022. The number of departments using AI has been growing steadily (2020: 14, 2021: 19, 2022: 23). The diversity (2020: 7, 2021: 18, 2022: 34) and the number of total implementations (2020: 19, 2021: 38, 2022: 68) has rapidly increased. Seven implementations were discontinued in 2022. Four hospital organizations said to use an AI platform or marketplace for the deployment of AI solutions. AI is mostly used to support chest CT (17), neuro CT (17), and musculoskeletal radiograph (12) analysis. The budget for AI was reserved in 13 of the responding centers in both 2021 and 2022. The most important obstacles to the adoption of AI remained costs and IT integration. Of the respondents, 28% stated that the implemented AI products realized health improvement and 32% assumed both health improvement and cost savings.
-
-                                        Conclusion
-                                        The adoption of AI products in radiology departments in the Netherlands is showing common signs of a developing market. The major obstacles to reaching widespread adoption are a lack of financial resources and IT integration difficulties.
-
-                                        Clinical relevance statement
-                                        The clinical impact of AI starts with its adoption in daily clinical practice. Increased transparency around AI products being adopted, implementation obstacles, and impact may inspire increased collaboration and improved decision-making around the implementation and financing of AI products.
-
-                                        Key Points
-                                        The adoption of artificial intelligence products for radiology has steadily increased since 2020 to at least a third of the centers using AI in clinical practice in the Netherlands in 2022.
-                                        The main areas in which artificial intelligence products are used are lung nodule detection on CT, aided stroke diagnosis, and bone age prediction.
-                                        The majority of respondents experienced added value (decreased costs and/or improved outcomes) from using artificial intelligence-based software; however, major obstacles to adoption remain the costs and IT-related difficulties.},
+                                          Objectives
+                                          To map the clinical use of CE-marked artificial intelligence (AI)-based software in radiology departments in the Netherlands (n = 69) between 2020 and 2022.
+  
+                                          Materials and methods
+                                          Our AI network (one radiologist or AI representative per Dutch hospital organization) received a questionnaire each spring from 2020 to 2022 about AI product usage, financing, and obstacles to adoption. Products that were not listed on www.AIforRadiology.com by July 2022 were excluded from the analysis.
+  
+                                          Results
+                                          The number of respondents was 43 in 2020, 36 in 2021, and 33 in 2022. The number of departments using AI has been growing steadily (2020: 14, 2021: 19, 2022: 23). The diversity (2020: 7, 2021: 18, 2022: 34) and the number of total implementations (2020: 19, 2021: 38, 2022: 68) has rapidly increased. Seven implementations were discontinued in 2022. Four hospital organizations said to use an AI platform or marketplace for the deployment of AI solutions. AI is mostly used to support chest CT (17), neuro CT (17), and musculoskeletal radiograph (12) analysis. The budget for AI was reserved in 13 of the responding centers in both 2021 and 2022. The most important obstacles to the adoption of AI remained costs and IT integration. Of the respondents, 28% stated that the implemented AI products realized health improvement and 32% assumed both health improvement and cost savings.
+  
+                                          Conclusion
+                                          The adoption of AI products in radiology departments in the Netherlands is showing common signs of a developing market. The major obstacles to reaching widespread adoption are a lack of financial resources and IT integration difficulties.
+  
+                                          Clinical relevance statement
+                                          The clinical impact of AI starts with its adoption in daily clinical practice. Increased transparency around AI products being adopted, implementation obstacles, and impact may inspire increased collaboration and improved decision-making around the implementation and financing of AI products.
+  
+                                          Key Points
+                                          The adoption of artificial intelligence products for radiology has steadily increased since 2020 to at least a third of the centers using AI in clinical practice in the Netherlands in 2022.
+                                          The main areas in which artificial intelligence products are used are lung nodule detection on CT, aided stroke diagnosis, and bone age prediction.
+                                          The majority of respondents experienced added value (decreased costs and/or improved outcomes) from using artificial intelligence-based software; however, major obstacles to adoption remain the costs and IT-related difficulties.},
   citation-count = {0},
   file = {Leeu23a.pdf:pdf\Leeu23a.pdf:PDF},
   journal = {European Radiology},
@@ -17349,13 +17377,13 @@ @article{Leeu23b
   title = {AI-support for the detection of intracranial large vessel occlusions: One-year prospective evaluation},
   doi = {10.1016/j.heliyon.2023.e19065},
   abstract = {Purpose
-                          Few studies have evaluated real-world performance of radiological AI-tools in clinical practice. Over one-year, we prospectively evaluated the use of AI software to support the detection of intracranial large vessel occlusions (LVO) on CT angiography (CTA).
-                          Method
-                          Quantitative measures (user log-in attempts, AI standalone performance) and qualitative data (user surveys) were reviewed by a key-user group at three timepoints. A total of 491 CTA studies of 460 patients were included for analysis.
-                          Results
-                          The overall accuracy of the AI-tool for LVO detection and localization was 87.6\%, sensitivity 69.1\% and specificity 91.2\%. Out of 81 LVOs, 31 of 34 (91\%) M1 occlusions were detected correctly, 19 of 38 (50\%) M2 occlusions, and 6 of 9 (67\%) ICA occlusions. The product was considered user-friendly. The diagnostic confidence of the users for LVO detection remained the same over the year. The last measured net promotor score was -56\%. The use of the AI-tool fluctuated over the year with a declining trend.
-                          Conclusions
-                          Our pragmatic approach of evaluating the AI-tool used in clinical practice, helped us to monitor the usage, to estimate the perceived added value by the users of the AI-tool, and to make an informed decision about the continuation of the use of the AI-tool.},
+                            Few studies have evaluated real-world performance of radiological AI-tools in clinical practice. Over one-year, we prospectively evaluated the use of AI software to support the detection of intracranial large vessel occlusions (LVO) on CT angiography (CTA).
+                            Method
+                            Quantitative measures (user log-in attempts, AI standalone performance) and qualitative data (user surveys) were reviewed by a key-user group at three timepoints. A total of 491 CTA studies of 460 patients were included for analysis.
+                            Results
+                            The overall accuracy of the AI-tool for LVO detection and localization was 87.6\%, sensitivity 69.1\% and specificity 91.2\%. Out of 81 LVOs, 31 of 34 (91\%) M1 occlusions were detected correctly, 19 of 38 (50\%) M2 occlusions, and 6 of 9 (67\%) ICA occlusions. The product was considered user-friendly. The diagnostic confidence of the users for LVO detection remained the same over the year. The last measured net promotor score was -56\%. The use of the AI-tool fluctuated over the year with a declining trend.
+                            Conclusions
+                            Our pragmatic approach of evaluating the AI-tool used in clinical practice, helped us to monitor the usage, to estimate the perceived added value by the users of the AI-tool, and to make an informed decision about the continuation of the use of the AI-tool.},
   citation-count = {0},
   file = {Leeu23b.pdf:pdf\Leeu23b.pdf:PDF},
   journal = {Heliyon},
@@ -17373,11 +17401,11 @@ @phdthesis{Leeu23c
   title = {Validation and implementation of commercial artificial intelligence software for radiology},
   url = {https://repository.ubn.ru.nl/handle/2066/295128},
   abstract = {The aim of this thesis is to increase transparency of the AI software applications for the radiology market: the medical specialty which currently covers 75% of all approved medical AI software. The focus is on products available for clinical use in Europe, in other words, products that are CE marked. We discuss the potential use cases of AI in radiology, map commercially available AI products, independently assess the performance of products, and measure and model the (potential) added value. With the insights we have gained and publicly shared, we enable more informed decision-making by AI purchasers, users, investors, and creators. Furthermore, it encourages use and development of AI that is safe and of value to society.
-                          The key contributions of this research are:
-                          - Three years of publicly sharing of information to a global audience on commercially available AI products, verified regulatory clearance information, product specifications, and scientific evidence, through www.AIforRadiology.com and associated monthly newsletter.
-                          - Initiating the Dutch Radiology AI-network connecting "AI-champions" among hospitals to share experiences and to enable the yearly inquiry on the clinical use of commercial AI.
-                          - Development of a framework for the independent and objective validation of commercially available AI products and applying this to ten products, for two different use cases, on data from seven medical centers. With this framework, we make validation more efficient and impartial, enabling informed purchasing or reimbursement decisions.
-                          - One of the first demonstrations of how an early health technology assessment can be performed to demonstrate the value of an AI product before implementation.},
+                            The key contributions of this research are:
+                            - Three years of publicly sharing of information to a global audience on commercially available AI products, verified regulatory clearance information, product specifications, and scientific evidence, through www.AIforRadiology.com and associated monthly newsletter.
+                            - Initiating the Dutch Radiology AI-network connecting "AI-champions" among hospitals to share experiences and to enable the yearly inquiry on the clinical use of commercial AI.
+                            - Development of a framework for the independent and objective validation of commercially available AI products and applying this to ten products, for two different use cases, on data from seven medical centers. With this framework, we make validation more efficient and impartial, enabling informed purchasing or reimbursement decisions.
+                            - One of the first demonstrations of how an early health technology assessment can be performed to demonstrate the value of an AI product before implementation.},
   copromotor = {M.J.C.M. Rutten, Dr. M. de Rooij, S. Schalekamp},
   file = {:pdf/Leeu23c.pdf:PDF},
   journal = {PhD thesis},
@@ -17415,12 +17443,12 @@ @article{Leij17
   pages = {1569-1577},
   doi = {10.1212/WNL.0000000000004490},
   abstract = {Objective: To investigate the temporal dynamics of cerebral small vessel disease (SVD) by 3 consecutive assessments over a period of 9 years, distinguishing progression from regression.
-
-                                                       Methods: Changes in SVD markers of 276 participants of the Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort (RUN DMC) cohort were assessed at 3 time points over 9 years. We assessed white matter hyperintensities (WMH) volume by semiautomatic segmentation and rated lacunes and microbleeds manually. We categorized baseline WMH severity as mild, moderate, or severe according to the modified Fazekas scale. We performed mixed-effects regression analysis including a quadratic term for increasing age.
-
-                                                       Results: Mean WMH progression over 9 years was 4.7 mL (0.54 mL/y; interquartile range 0.95-5.5 mL), 20.3% of patients had incident lacunes (2.3%/y), and 18.9% had incident microbleeds (2.2%/y). WMH volume declined in 9.4% of the participants during the first follow-up interval, but only for 1 participant (0.4%) throughout the whole follow-up. Lacunes disappeared in 3.6% and microbleeds in 5.7% of the participants. WMH progression accelerated over time: including a quadratic term for increasing age during follow-up significantly improved the model (p < 0.001). SVD progression was predominantly seen in participants with moderate to severe WMH at baseline compared to those with mild WMH (odds ratio [OR] 35.5, 95% confidence interval [CI] 15.8-80.0, p < 0.001 for WMH progression; OR 5.7, 95% CI 2.8-11.2, p < 0.001 for incident lacunes; and OR 2.9, 95% CI 1.4-5.9, p = 0.003 for incident microbleeds).
-
-                                                       Conclusions: SVD progression is nonlinear, accelerating over time, and a highly dynamic process, with progression interrupted by reduction in some, in a population that on average shows progression.},
+  
+                                                         Methods: Changes in SVD markers of 276 participants of the Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort (RUN DMC) cohort were assessed at 3 time points over 9 years. We assessed white matter hyperintensities (WMH) volume by semiautomatic segmentation and rated lacunes and microbleeds manually. We categorized baseline WMH severity as mild, moderate, or severe according to the modified Fazekas scale. We performed mixed-effects regression analysis including a quadratic term for increasing age.
+  
+                                                         Results: Mean WMH progression over 9 years was 4.7 mL (0.54 mL/y; interquartile range 0.95-5.5 mL), 20.3% of patients had incident lacunes (2.3%/y), and 18.9% had incident microbleeds (2.2%/y). WMH volume declined in 9.4% of the participants during the first follow-up interval, but only for 1 participant (0.4%) throughout the whole follow-up. Lacunes disappeared in 3.6% and microbleeds in 5.7% of the participants. WMH progression accelerated over time: including a quadratic term for increasing age during follow-up significantly improved the model (p < 0.001). SVD progression was predominantly seen in participants with moderate to severe WMH at baseline compared to those with mild WMH (odds ratio [OR] 35.5, 95% confidence interval [CI] 15.8-80.0, p < 0.001 for WMH progression; OR 5.7, 95% CI 2.8-11.2, p < 0.001 for incident lacunes; and OR 2.9, 95% CI 1.4-5.9, p = 0.003 for incident microbleeds).
+  
+                                                         Conclusions: SVD progression is nonlinear, accelerating over time, and a highly dynamic process, with progression interrupted by reduction in some, in a population that on average shows progression.},
   file = {Leij17.pdf:pdf\\Leij17.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {28878046},
@@ -17458,19 +17486,19 @@ @article{Leij18a
   url = {http://dx.doi.org/10.1161/STROKEAHA.118.020980},
   volume = {49},
   abstract = {Background and Purpose--
-                                     White matter hyperintensities (WMH) are frequently seen on neuroimaging of elderly and are associated with cognitive decline and the development of dementia. Yet, the temporal dynamics of conversion of normal-appearing white matter (NAWM) into WMH remains unknown. We examined whether and when progression of WMH was preceded by changes in fluid-attenuated inversion recovery and diffusion tensor imaging values, thereby taking into account differences between participants with mild versus severe baseline WMH.
-
-
-                                     Methods--
-                                     From 266 participants of the RUN DMC study (Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort), we semiautomatically segmented WMH at 3 time points for 9 years. Images were registered to standard space through a subject template. We analyzed differences in baseline fluid-attenuated inversion recovery, fractional anisotropy, and mean diffusivity (MD) values and changes in MD values over time between 4 regions: (1) remaining NAWM, (2) NAWM converting into WMH in the second follow-up period, (3) NAWM converting into WMH in the first follow-up period, and (4) WMH.
-
-
-                                     Results--
-                                     NAWM converting into WMH in the first or second time interval showed higher fluid-attenuated inversion recovery and MD values than remaining NAWM. MD values in NAWM converting into WMH in the first time interval were similar to MD values in WMH. When stratified by baseline WMH severity, participants with severe WMH had higher fluid-attenuated inversion recovery and MD and lower fractional anisotropy values than participants with mild WMH, in all areas including the NAWM. MD values in WMH and in NAWM that converted into WMH continuously increased over time.
-
-
-                                     Conclusions--
-                                     Impaired microstructural integrity preceded conversion into WMH and continuously declined over time, suggesting a continuous disease process of white matter integrity loss that can be detected using diffusion tensor imaging even years before WMH become visible on conventional neuroimaging. Differences in microstructural integrity between participants with mild versus severe WMH suggest heterogeneity of both NAWM and WMH, which might explain the clinical variability observed in patients with similar small vessel disease severity.},
+                                       White matter hyperintensities (WMH) are frequently seen on neuroimaging of elderly and are associated with cognitive decline and the development of dementia. Yet, the temporal dynamics of conversion of normal-appearing white matter (NAWM) into WMH remains unknown. We examined whether and when progression of WMH was preceded by changes in fluid-attenuated inversion recovery and diffusion tensor imaging values, thereby taking into account differences between participants with mild versus severe baseline WMH.
+  
+  
+                                       Methods--
+                                       From 266 participants of the RUN DMC study (Radboud University Nijmegen Diffusion Tensor and Magnetic Resonance Imaging Cohort), we semiautomatically segmented WMH at 3 time points for 9 years. Images were registered to standard space through a subject template. We analyzed differences in baseline fluid-attenuated inversion recovery, fractional anisotropy, and mean diffusivity (MD) values and changes in MD values over time between 4 regions: (1) remaining NAWM, (2) NAWM converting into WMH in the second follow-up period, (3) NAWM converting into WMH in the first follow-up period, and (4) WMH.
+  
+  
+                                       Results--
+                                       NAWM converting into WMH in the first or second time interval showed higher fluid-attenuated inversion recovery and MD values than remaining NAWM. MD values in NAWM converting into WMH in the first time interval were similar to MD values in WMH. When stratified by baseline WMH severity, participants with severe WMH had higher fluid-attenuated inversion recovery and MD and lower fractional anisotropy values than participants with mild WMH, in all areas including the NAWM. MD values in WMH and in NAWM that converted into WMH continuously increased over time.
+  
+  
+                                       Conclusions--
+                                       Impaired microstructural integrity preceded conversion into WMH and continuously declined over time, suggesting a continuous disease process of white matter integrity loss that can be detected using diffusion tensor imaging even years before WMH become visible on conventional neuroimaging. Differences in microstructural integrity between participants with mild versus severe WMH suggest heterogeneity of both NAWM and WMH, which might explain the clinical variability observed in patients with similar small vessel disease severity.},
   all_ss_ids = {[2b2ee4622512c58b3557448b11b9bcd7bcfbbbfa]},
   automatic = {yes},
   citation-count = {62},
@@ -17544,15 +17572,15 @@ @conference{Leon23
   doi = {10.1158/1538-7445.sabcs22-p2-11-34},
   year = {2023},
   abstract = {Abstract
-                                         Background: Triple-negative breast cancers (TNBC) exhibit high rates of recurrence and mortality. However, recent studies suggest that a subset of patients (pts) with early-stage TNBC enriched in tumor-infiltrating lymphocytes (TILs) have excellent clinical outcomes even in the absence of systemic therapy. Additional histological biomarkers that could identify pts for future systemic therapy escalation/de-escalation strategies are of great interest. TNBC are frequently highly proliferative with abundant mitoses. However, classic markers of proliferation (manual mitosis counting and Ki-67) appear to offer no prognostic value. Here, we evaluated the prognostic effects of automated mitotic spindle hotspot (AMSH) counting on RFS in independent cohorts of systemically untreated early-stage TNBC.
-                                         Methods: AMSH counting was conducted with a state-of-the-art deep learning algorithm trained on the detection of mitoses within 2 mm2 areas with the highest mitotic density (i.e. hotspots) in digital H&amp;E images. Details of the development, training and validation of the algorithm were published previously [1] in a cohort of unselected TNBC. We obtained AMSH counts in a centrally confirmed TNBC cohort from Mayo Clinic [2] and focused our analysis on pts who received locoregional therapy but no systemic therapy. Using a fractional polynomial analysis with a multivariable proportional hazards regression model, we confirmed the assumption of linearity in the log hazard for the continuous variable AMSH and evaluated whether AMSH counts were prognostic of RFS. We corroborated our findings in an independent cohort of systemically untreated TNBC pts from the Radboud University Medical Center in the Netherlands (Radboud Cohort). Results are reported at a median follow-up of 8.1 and 6.7 years for the Mayo and Netherlands cohorts, respectively.
-                                         Results: Among 182 pts with who did not receive systemic therapy in the Mayo Cohort, 140 (77\%) with available AMSH counts were included. The mean age was 61 (range: 31-94), 71\% were postmenopausal, 67\% had tumors <= 2cm, and 83\% were node-negative. As expected, most tumors were Nottingham grade 3 (84\%) and had a high Ki-67 proliferation index (54\% with Ki-67 &amp;gt;30\%). Most tumors (73\%) had stromal TILs <= 30\%. The median AMSH count was 18 (IQR: 8, 42). AMSH counts were linearly associated with grade and tumor size, with the proportion of pts with grade 3 tumors and size &amp;gt; 2 cm increasing as the AMSH counts increased (p=0.007 and p=0.059, respectively). In a multivariate model controlling for nodal status, tumor size, and stromal TILs, AMSH counts were independently associated with RFS (p&amp;lt; 0.0001). For every 10-point increase in the AMSH count, we observed a 17\% increase in the risk of experiencing an RFS event (HR 1.17, 95\% CI 1.08-1.26). We corroborated our findings in the Radboud Cohort (n=126). The mean age was 68 (range: 40-96), and 81\% were node-negative. While the median AMSH count was 36 (IQR: 16-63), higher than in the Mayo Cohort (p=0.004), the prognostic impact was similar, with a significant association between AMSH count and RFS (p=0.028) in a multivariate model corrected for nodal status, tumor size, and stromal TILs. For every 10-point increase in the AMSH count in the Netherlands cohort, we observed a 9\% increase in the risk of experiencing an RFS event (HR 1.09, 95\% CI 1.01-1.17). RFS rates according to AMSH counts for both cohorts are shown in the Table.
-                                         Conclusions: AMSH counting is a new proliferation biomarker that provides prognostic value independent of nodal status, tumor size, and stromal TILs in systemically untreated early-stage TNBC. Plans are underway to evaluate AMSH counts in additional cohorts of systemically untreated TNBC, and in other disease settings such as prior to neoadjuvant systemic therapy. If validated, this biomarker should be prospectively evaluated as a potential selection biomarker in clinical trials of systemic therapy de-escalation.
-                                         References:
-                                         1. PMID: 29994086
-                                         2. PMID: 28913760
-                                         Table RFS according to AMSH counts in the Mayo and Radboud Cohorts
-                                         Citation Format: Roberto A. Leon-Ferre, Jodi M. Carter, David Zahrieh, Jason P. Sinnwell, Roberto Salgado, Vera Suman, David Hillman, Judy C. Boughey, Krishna R. Kalari, Fergus J. Couch, James N. Ingle, Maschenka Balkenkohl, Francesco Ciompi, Jeroen van der Laak, Matthew P. Goetz. Mitotic spindle hotspot counting using deep learning networks is highly associated with clinical outcomes in patients with early-stage triple-negative breast cancer who did not receive systemic therapy [abstract]. In: Proceedings of the 2022 San Antonio Breast Cancer Symposium; 2022 Dec 6-10; San Antonio, TX. Philadelphia (PA): AACR; Cancer Res 2023;83(5 Suppl):Abstract nr P2-11-34.},
+                                           Background: Triple-negative breast cancers (TNBC) exhibit high rates of recurrence and mortality. However, recent studies suggest that a subset of patients (pts) with early-stage TNBC enriched in tumor-infiltrating lymphocytes (TILs) have excellent clinical outcomes even in the absence of systemic therapy. Additional histological biomarkers that could identify pts for future systemic therapy escalation/de-escalation strategies are of great interest. TNBC are frequently highly proliferative with abundant mitoses. However, classic markers of proliferation (manual mitosis counting and Ki-67) appear to offer no prognostic value. Here, we evaluated the prognostic effects of automated mitotic spindle hotspot (AMSH) counting on RFS in independent cohorts of systemically untreated early-stage TNBC.
+                                           Methods: AMSH counting was conducted with a state-of-the-art deep learning algorithm trained on the detection of mitoses within 2 mm2 areas with the highest mitotic density (i.e. hotspots) in digital H\&E images. Details of the development, training and validation of the algorithm were published previously [1] in a cohort of unselected TNBC. We obtained AMSH counts in a centrally confirmed TNBC cohort from Mayo Clinic [2] and focused our analysis on pts who received locoregional therapy but no systemic therapy. Using a fractional polynomial analysis with a multivariable proportional hazards regression model, we confirmed the assumption of linearity in the log hazard for the continuous variable AMSH and evaluated whether AMSH counts were prognostic of RFS. We corroborated our findings in an independent cohort of systemically untreated TNBC pts from the Radboud University Medical Center in the Netherlands (Radboud Cohort). Results are reported at a median follow-up of 8.1 and 6.7 years for the Mayo and Netherlands cohorts, respectively.
+                                           Results: Among 182 pts with who did not receive systemic therapy in the Mayo Cohort, 140 (77\%) with available AMSH counts were included. The mean age was 61 (range: 31-94), 71\% were postmenopausal, 67\% had tumors <= 2cm, and 83\% were node-negative. As expected, most tumors were Nottingham grade 3 (84\%) and had a high Ki-67 proliferation index (54\% with Ki-67 >30\%). Most tumors (73\%) had stromal TILs <= 30\%. The median AMSH count was 18 (IQR: 8, 42). AMSH counts were linearly associated with grade and tumor size, with the proportion of pts with grade 3 tumors and size > 2 cm increasing as the AMSH counts increased (p=0.007 and p=0.059, respectively). In a multivariate model controlling for nodal status, tumor size, and stromal TILs, AMSH counts were independently associated with RFS (p < 0.0001). For every 10-point increase in the AMSH count, we observed a 17\% increase in the risk of experiencing an RFS event (HR 1.17, 95\% CI 1.08-1.26). We corroborated our findings in the Radboud Cohort (n=126). The mean age was 68 (range: 40-96), and 81\% were node-negative. While the median AMSH count was 36 (IQR: 16-63), higher than in the Mayo Cohort (p=0.004), the prognostic impact was similar, with a significant association between AMSH count and RFS (p=0.028) in a multivariate model corrected for nodal status, tumor size, and stromal TILs. For every 10-point increase in the AMSH count in the Netherlands cohort, we observed a 9\% increase in the risk of experiencing an RFS event (HR 1.09, 95\% CI 1.01-1.17). RFS rates according to AMSH counts for both cohorts are shown in the Table.
+                                           Conclusions: AMSH counting is a new proliferation biomarker that provides prognostic value independent of nodal status, tumor size, and stromal TILs in systemically untreated early-stage TNBC. Plans are underway to evaluate AMSH counts in additional cohorts of systemically untreated TNBC, and in other disease settings such as prior to neoadjuvant systemic therapy. If validated, this biomarker should be prospectively evaluated as a potential selection biomarker in clinical trials of systemic therapy de-escalation.
+                                           References:
+                                           1. PMID: 29994086
+                                           2. PMID: 28913760
+                                           Table RFS according to AMSH counts in the Mayo and Radboud Cohorts
+                                           Citation Format: Roberto A. Leon-Ferre, Jodi M. Carter, David Zahrieh, Jason P. Sinnwell, Roberto Salgado, Vera Suman, David Hillman, Judy C. Boughey, Krishna R. Kalari, Fergus J. Couch, James N. Ingle, Maschenka Balkenkohl, Francesco Ciompi, Jeroen van der Laak, Matthew P. Goetz. Mitotic spindle hotspot counting using deep learning networks is highly associated with clinical outcomes in patients with early-stage triple-negative breast cancer who did not receive systemic therapy [abstract]. In: Proceedings of the 2022 San Antonio Breast Cancer Symposium; 2022 Dec 6-10; San Antonio, TX. Philadelphia (PA): AACR; Cancer Res 2023;83(5 Suppl):Abstract nr P2-11-34.},
   url = {http://dx.doi.org/10.1158/1538-7445.SABCS22-P2-11-34},
   file = {Leon23.pdf:pdf\Leon23.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -17686,12 +17714,12 @@ @inproceedings{Less16
   pages = {978511-1 -- 978511-6},
   doi = {10.1117/12.2216978},
   abstract = {The amount of calcifications in the coronary arteries is a powerful and independent predictor of cardiovascular events and is used to identify subjects at high risk who might benefit from preventive treatment. Routine quantification of coronary calcium scores can complement screening programs using low-dose chest CT, such as lung cancer screening. We present a system for automatic coronary calcium scoring based on deep convolutional neural networks (CNNs).
-
-                                                       The system uses three independently trained CNNs to estimate a bounding box around the heart. In this region of interest, connected components above 130 HU are considered candidates for coronary artery calcifications. To separate them from other high intensity lesions, classification of all extracted voxels is performed by feeding two-dimensional 50 mm x 50 mm patches from three orthogonal planes into three concurrent CNNs. The networks consist of three convolutional layers and one fully-connected layer with 256 neurons.
-
-                                                       In the experiments, 1028 non-contrast-enhanced and non-ECG-triggered low-dose chest CT scans were used. The network was trained on 797 scans. In the remaining 231 test scans, the method detected on average 194.3 mm3 of 199.8 mm3 coronary calcifications per scan (sensitivity 97.2%) with an average false-positive volume of 10.3 mm3. Subjects were assigned to one of five standard cardiovascular risk categories based on the Agatston score. Accuracy of risk category assignment was 84.4% with a linearly weighted kappa of 0.89.
-
-                                                       The proposed system can perform automatic coronary artery calcium scoring to identify subjects undergoing low-dose chest CT screening who are at risk of cardiovascular events with high accuracy.},
+  
+                                                         The system uses three independently trained CNNs to estimate a bounding box around the heart. In this region of interest, connected components above 130 HU are considered candidates for coronary artery calcifications. To separate them from other high intensity lesions, classification of all extracted voxels is performed by feeding two-dimensional 50 mm x 50 mm patches from three orthogonal planes into three concurrent CNNs. The networks consist of three convolutional layers and one fully-connected layer with 256 neurons.
+  
+                                                         In the experiments, 1028 non-contrast-enhanced and non-ECG-triggered low-dose chest CT scans were used. The network was trained on 797 scans. In the remaining 231 test scans, the method detected on average 194.3 mm3 of 199.8 mm3 coronary calcifications per scan (sensitivity 97.2%) with an average false-positive volume of 10.3 mm3. Subjects were assigned to one of five standard cardiovascular risk categories based on the Agatston score. Accuracy of risk category assignment was 84.4% with a linearly weighted kappa of 0.89.
+  
+                                                         The proposed system can perform automatic coronary artery calcium scoring to identify subjects undergoing low-dose chest CT screening who are at risk of cardiovascular events with high accuracy.},
   file = {Less16.pdf:pdf\\Less16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   month = {3},
@@ -17985,22 +18013,22 @@ @conference{Lief16
   booktitle = {4th International Congress on OCT Angiography and Advances in OCT},
   year = {2016},
   abstract = {Purpose
-                                                       :
-                                                       A tool for grading micro-aneurysms and the Foveal Avascular Zone (FAZ) in Fluorescein Angiography (FA) and OCT Angiography (OCTA) has been developed. With this tool the user can compare visibility and grade micro-aneurysms by displaying early FA, late FA and inner, intermediate and outer OCTA images in a synchronized view.
-
-                                                       Methods
-                                                       :
-                                                       The user can register the images in two steps by clicking on corresponding landmarks: early and late FA should be registered, as well as early FA to OCTA. A least-squares approximation to the affine transform that best matches the annotated point sets is calculated. Visual feedback is available during this stage by blending the images that need to be registered.
-                                                       Once the images are registered, a synchronized cursor helps the user in finding and comparing micro-aneurysms in all five images. The FAZ, for which the area is automatically calculated, can be drawn onto each image as well.
-
-                                                       Results
-                                                       :
-
-                                                       Early and late FA and OCTA images, segmented into an inner, intermediate and outer layer, have been acquired for 31 eyes of 24 patients with Diabetic Macular Edema (DME). In every set of images, enough landmarks could be found for successful registration. The affine transform was sufficiently accurate to compare micro-aneurysms in the different images. The tool has been used for grading visibility and leakage of 567 micro-aneurysms. The FAZ could be delineated accurately in each image except the late FA where it was not visible.
-
-                                                       Conclusion
-                                                       :
-                                                       We developed a tool that can help researchers in comparing properties of FA and OCTA images, by registration of 5 different images (early and late FA, inner, intermediate and outer OCTA). The tool has been used for grading micro-aneurysms and delineating the FAZ for patients with DME.},
+                                                         :
+                                                         A tool for grading micro-aneurysms and the Foveal Avascular Zone (FAZ) in Fluorescein Angiography (FA) and OCT Angiography (OCTA) has been developed. With this tool the user can compare visibility and grade micro-aneurysms by displaying early FA, late FA and inner, intermediate and outer OCTA images in a synchronized view.
+  
+                                                         Methods
+                                                         :
+                                                         The user can register the images in two steps by clicking on corresponding landmarks: early and late FA should be registered, as well as early FA to OCTA. A least-squares approximation to the affine transform that best matches the annotated point sets is calculated. Visual feedback is available during this stage by blending the images that need to be registered.
+                                                         Once the images are registered, a synchronized cursor helps the user in finding and comparing micro-aneurysms in all five images. The FAZ, for which the area is automatically calculated, can be drawn onto each image as well.
+  
+                                                         Results
+                                                         :
+  
+                                                         Early and late FA and OCTA images, segmented into an inner, intermediate and outer layer, have been acquired for 31 eyes of 24 patients with Diabetic Macular Edema (DME). In every set of images, enough landmarks could be found for successful registration. The affine transform was sufficiently accurate to compare micro-aneurysms in the different images. The tool has been used for grading visibility and leakage of 567 micro-aneurysms. The FAZ could be delineated accurately in each image except the late FA where it was not visible.
+  
+                                                         Conclusion
+                                                         :
+                                                         We developed a tool that can help researchers in comparing properties of FA and OCTA images, by registration of 5 different images (early and late FA, inner, intermediate and outer OCTA). The tool has been used for grading micro-aneurysms and delineating the FAZ for patients with DME.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -18028,15 +18056,15 @@ @conference{Lief17a
   booktitle = ARVO,
   title = {Automatic detection of the foveal center in optical coherence tomography},
   abstract = {Purpose : To aautomatically detect the foveal center in optical coherence tomography (OCT) scans in order to obtain an accurate and reliable reference for the assessment of various structural biomarkers, even in the presence of large abnormalities and across different scanning protocols.
-
-                                                       Methods : 1784 OCT scans were used for the development of the proposed automatic method: 1744 scans from the European Genetic Database (EUGENDA) acquired with a Heidelberg Spectralis HRA+OCT 1 scanner and 40 scans from a publicly available dataset [1] acquired with a Bioptigen scanner. Two independent sets, with different levels of age-related macular degeneration (AMD) were drawn from the same databases for evaluation: 100 scans from EUGENDA (Set A, 25 control patients and 25 for each of the AMD severity levels early, intermediate and advanced) and 100 scans from [1] (Set B, 50 control, 50 AMD).
-                                                       A fully convolutional neural network based on stacked layers of dilated convolutions was trained to classify each pixel in a B-scan by assigning a probability of belonging to the fovea. The network was applied to every B-scan in the OCT volume, and the final foveal center was defined as the pixel with maximum assigned probability. An initial network was trained on the 1744 training scans from EUGENDA and optimized with the 40 training scans acquired with the Bioptigen scanner, to specialize for different levels of noise and contrast.
-
-                                                       For all scans manual annotations were available as reference for evaluation. The foveal center was considered correctly identified if the distance between the prediction and the reference was less than the foveal radius, i.e. 750 mm.
-
-                                                       Results : The foveal center was correctly detected in 95 OCT scans in Set A (24 control, 24 early, 25 intermediate, 22 advanced). The mean distance error was 63.7 mm with 81 detections inside a radius of 175 mm (the foveola) and 70 inside a radius of 75 mm (the umbo). In Set B, the foveal center was correctly identified in 96 OCT scans (49 control, 47 AMD). The mean distance error was 88.6 mm with 82 detections inside the foveola and 61 inside the umbo.
-
-                                                       Conclusions : The proposed automatic method performed accurately for both healthy retinas and retinas affected by AMD. The method can be applied successfully to scans from different vendors, thus providing a reliable reference location for the assessment of structural biomarkers in OCT.},
+  
+                                                         Methods : 1784 OCT scans were used for the development of the proposed automatic method: 1744 scans from the European Genetic Database (EUGENDA) acquired with a Heidelberg Spectralis HRA+OCT 1 scanner and 40 scans from a publicly available dataset [1] acquired with a Bioptigen scanner. Two independent sets, with different levels of age-related macular degeneration (AMD) were drawn from the same databases for evaluation: 100 scans from EUGENDA (Set A, 25 control patients and 25 for each of the AMD severity levels early, intermediate and advanced) and 100 scans from [1] (Set B, 50 control, 50 AMD).
+                                                         A fully convolutional neural network based on stacked layers of dilated convolutions was trained to classify each pixel in a B-scan by assigning a probability of belonging to the fovea. The network was applied to every B-scan in the OCT volume, and the final foveal center was defined as the pixel with maximum assigned probability. An initial network was trained on the 1744 training scans from EUGENDA and optimized with the 40 training scans acquired with the Bioptigen scanner, to specialize for different levels of noise and contrast.
+  
+                                                         For all scans manual annotations were available as reference for evaluation. The foveal center was considered correctly identified if the distance between the prediction and the reference was less than the foveal radius, i.e. 750 $\mu$m.
+  
+                                                         Results : The foveal center was correctly detected in 95 OCT scans in Set A (24 control, 24 early, 25 intermediate, 22 advanced). The mean distance error was 63.7 $\mu$m with 81 detections inside a radius of 175 $\mu$m (the foveola) and 70 inside a radius of 75 $\mu$m (the umbo). In Set B, the foveal center was correctly identified in 96 OCT scans (49 control, 47 AMD). The mean distance error was 88.6 $\mu$m with 82 detections inside the foveola and 61 inside the umbo.
+  
+                                                         Conclusions : The proposed automatic method performed accurately for both healthy retinas and retinas affected by AMD. The method can be applied successfully to scans from different vendors, thus providing a reliable reference location for the assessment of structural biomarkers in OCT.},
   optnote = {DIAG, RADIOLOGY},
   year = {2017},
 }
@@ -18088,17 +18116,17 @@ @conference{Lief19a
   booktitle = ARVO,
   title = {Prediction of areas at risk of developing geographic atrophy in color fundus images using deep learning},
   abstract = {Purpose:
-                                                       Exact quantification of areas of geographic atrophy (GA) can provide an important anatomical endpoint for treatment trials. The prediction of areas where GA may develop can provide valuable personalized prognosis and help in the development of targeted treatments to prevent progression and further vision loss. In this work, we present a model based on a deep convolutional neural network (CNN) that predicts the areas of GA within 5 years from baseline using color fundus (CF) images.
-
-                                                       Methods:
-                                                       Areas of GA were delineated by 4 to 5 experienced graders in consensus in 377 CF images (252 eyes) collected from the Rotterdam Study and the Blue Mountains Eye Study. Graders made use of multimodal and follow up images when available, using our EyeNED annotation workstation. We identified 84 pairs of images (baseline and follow-up) of the same eye that were acquired with an interval of approximately 5 years. Image registration was performed by identifying corresponding landmarks between the images, allowing to project the delineated GA of the follow-up image onto the baseline image.
-                                                       Next, a fully automatic segmentation model, based on a deep CNN, was developed. The CNN was trained to simultaneously segment the current GA area and the area at risk of developing GA, using only the baseline image as input. A five-fold cross-validation was performed to validate the prediction performance.
-
-                                                       Results:
-                                                       The model achieved an average dice coefficient of 0.63 for segmentation of areas at risk of developing GA in the 84 images. The intraclass correlation coefficient between the GA area defined by the consensus grading of the follow-up image and the automatically predicted area based on the baseline image was 0.54.
-
-                                                       Conclusions:
-                                                       We present a model based on a deep CNN that is capable of identifying areas where GA may develop from CF images. The proposed approach constitutes a step towards personalized prognosis and possible treatment decisions. Furthermore, the model may be used for automatic discovery of new predictive biomarkers for development and growth rate of GA, and may help to automatically identify individuals at risk of developing GA.},
+                                                         Exact quantification of areas of geographic atrophy (GA) can provide an important anatomical endpoint for treatment trials. The prediction of areas where GA may develop can provide valuable personalized prognosis and help in the development of targeted treatments to prevent progression and further vision loss. In this work, we present a model based on a deep convolutional neural network (CNN) that predicts the areas of GA within 5 years from baseline using color fundus (CF) images.
+  
+                                                         Methods:
+                                                         Areas of GA were delineated by 4 to 5 experienced graders in consensus in 377 CF images (252 eyes) collected from the Rotterdam Study and the Blue Mountains Eye Study. Graders made use of multimodal and follow up images when available, using our EyeNED annotation workstation. We identified 84 pairs of images (baseline and follow-up) of the same eye that were acquired with an interval of approximately 5 years. Image registration was performed by identifying corresponding landmarks between the images, allowing to project the delineated GA of the follow-up image onto the baseline image.
+                                                         Next, a fully automatic segmentation model, based on a deep CNN, was developed. The CNN was trained to simultaneously segment the current GA area and the area at risk of developing GA, using only the baseline image as input. A five-fold cross-validation was performed to validate the prediction performance.
+  
+                                                         Results:
+                                                         The model achieved an average dice coefficient of 0.63 for segmentation of areas at risk of developing GA in the 84 images. The intraclass correlation coefficient between the GA area defined by the consensus grading of the follow-up image and the automatically predicted area based on the baseline image was 0.54.
+  
+                                                         Conclusions:
+                                                         We present a model based on a deep CNN that is capable of identifying areas where GA may develop from CF images. The proposed approach constitutes a step towards personalized prognosis and possible treatment decisions. Furthermore, the model may be used for automatic discovery of new predictive biomarkers for development and growth rate of GA, and may help to automatically identify individuals at risk of developing GA.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
   all_ss_ids = {[6d1ea27b41023f9add67e2c8c4dcbc7866ae640b]},
@@ -18124,25 +18152,25 @@ @article{Lief20
   doi = {10.1016/j.ophtha.2020.02.009},
   url = {https://arxiv.org/abs/1908.05621},
   abstract = {PURPOSE:
-                                                      To develop and validate a deep learning model for the automatic segmentation of geographic atrophy (GA) using color fundus images (CFIs) and its application to study the growth rate of GA.
-
-                                                      DESIGN:
-                                                      Prospective, multicenter, natural history study with up to 15 years of follow-up.
-
-                                                      PARTICIPANTS:
-                                                      Four hundred nine CFIs of 238 eyes with GA from the Rotterdam Study (RS) and Blue Mountain Eye Study (BMES) for model development, and 3589 CFIs of 376 eyes from the Age-Related Eye Disease Study (AREDS) for analysis of GA growth rate.
-
-                                                      METHODS:
-                                                      Deep learning model based on an ensemble of encoder-decoder architectures was implemented and optimized for the segmentation of GA in CFIs. Four experienced graders delineated, in consensus, GA in CFIs from the RS and BMES. These manual delineations were used to evaluate the segmentation model using 5-fold cross-validation. The model was applied further to CFIs from the AREDS to study the growth rate of GA. Linear regression analysis was used to study associations between structural biomarkers at baseline and the GA growth rate. A general estimate of the progression of GA area over time was made by combining growth rates of all eyes with GA from the AREDS set.
-
-                                                      MAIN OUTCOME MEASURES:
-                                                      Automatically segmented GA and GA growth rate.
-
-                                                      RESULTS:
-                                                      The model obtained an average Dice coefficient of 0.72+-0.26 on the BMES and RS set while comparing the automatically segmented GA area with the graders' manual delineations. An intraclass correlation coefficient of 0.83 was reached between the automatically estimated GA area and the graders' consensus measures. Nine automatically calculated structural biomarkers (area, filled area, convex area, convex solidity, eccentricity, roundness, foveal involvement, perimeter, and circularity) were significantly associated with growth rate. Combining all growth rates indicated that GA area grows quadratically up to an area of approximately 12 mm2, after which growth rate stabilizes or decreases.
-
-                                                      CONCLUSIONS:
-                                                      The deep learning model allowed for fully automatic and robust segmentation of GA on CFIs. These segmentations can be used to extract structural characteristics of GA that predict its growth rate.},
+                                                        To develop and validate a deep learning model for the automatic segmentation of geographic atrophy (GA) using color fundus images (CFIs) and its application to study the growth rate of GA.
+  
+                                                        DESIGN:
+                                                        Prospective, multicenter, natural history study with up to 15 years of follow-up.
+  
+                                                        PARTICIPANTS:
+                                                        Four hundred nine CFIs of 238 eyes with GA from the Rotterdam Study (RS) and Blue Mountain Eye Study (BMES) for model development, and 3589 CFIs of 376 eyes from the Age-Related Eye Disease Study (AREDS) for analysis of GA growth rate.
+  
+                                                        METHODS:
+                                                        Deep learning model based on an ensemble of encoder-decoder architectures was implemented and optimized for the segmentation of GA in CFIs. Four experienced graders delineated, in consensus, GA in CFIs from the RS and BMES. These manual delineations were used to evaluate the segmentation model using 5-fold cross-validation. The model was applied further to CFIs from the AREDS to study the growth rate of GA. Linear regression analysis was used to study associations between structural biomarkers at baseline and the GA growth rate. A general estimate of the progression of GA area over time was made by combining growth rates of all eyes with GA from the AREDS set.
+  
+                                                        MAIN OUTCOME MEASURES:
+                                                        Automatically segmented GA and GA growth rate.
+  
+                                                        RESULTS:
+                                                        The model obtained an average Dice coefficient of 0.72+-0.26 on the BMES and RS set while comparing the automatically segmented GA area with the graders' manual delineations. An intraclass correlation coefficient of 0.83 was reached between the automatically estimated GA area and the graders' consensus measures. Nine automatically calculated structural biomarkers (area, filled area, convex area, convex solidity, eccentricity, roundness, foveal involvement, perimeter, and circularity) were significantly associated with growth rate. Combining all growth rates indicated that GA area grows quadratically up to an area of approximately 12 mm$^2$, after which growth rate stabilizes or decreases.
+  
+                                                        CONCLUSIONS:
+                                                        The deep learning model allowed for fully automatic and robust segmentation of GA on CFIs. These segmentations can be used to extract structural characteristics of GA that predict its growth rate.},
   file = {Lief20.pdf:pdf\\Lief20.pdf:PDF},
   journal = Ophthalmology,
   volume = {127},
@@ -18164,15 +18192,15 @@ @conference{Lief20a
   url = {https://www.euretina.org/congress/amsterdam-2020/virtual-2020-freepapers/},
   title = {Achieving expert level performance in quantifying 13 distinctive features of neovascular age-related macular degeneration on optical coherence tomography},
   abstract = {Purpose:
-                              				To develop and validate an automatic model for volumetric quantification of the 13 most common abnormalities associated with neovascular age-related macular degeneration (nAMD) on optical coherence tomography (OCT).
-                              				Setting:
-                              				Clinical data and associated imaging were collected from five UK secondary care providers between February 2002 and September 2017. We identified 680 treatment-naive patients with no recent cataract surgery, at least one anti-VEGF injection, a diagnosis of nAMD, and associated OCT imaging (Topcon, Tokyo, Japan).
-                              				Methods:
-                              				A deep convolutional neural network (CNN) was used to produce a volumetric segmentation of 13 retinal abnormalities. The CNN architecture was based on a deep encoder-decoder structure that combines information from adjacent B-scans. The model was trained on 2,712 B-scans from 307 OCT volumes, with manual labels provided at a voxel-level for all abnormalities by eight graders. Abnormalities that were found in over 80 B-scans were modelled. The performance of the model and graders was assessed on an independent set of 112 B-scans from 112 OCT-volumes of nAMD cases, for which four graders independently provided annotations. To create a reference standard, the outputs of three graders were combined and defined as voxels where at least two out of three agreed. The graders' accuracy was calculated using each grader, in turn, as an observer. The Dice similarity metric was used to compare overlap, calculated per A-scan or per voxel where appropriate. Free-response receiver operator characteristic (FROC) analysis was used for the detection of small abnormalities. The intraclass correlation coefficient (ICC) was used to measure agreement on area or volume measures, with the reference area or volume defined as the average of the three graders.
-                              				Results:
-                              				Included abnormalities were: intraretinal fluid (IRF), subretinal fluid (SRF), pigment epithelial detachment (PED), subretinal hyperreflective material (SHRM), fibrosis, drusen and drusenoid PED, epiretinal membrane (ERM), outer plexiform layer (OPL) descent, ellipsoid loss, retinal pigment epithelium (RPE) loss or attenuation, hyper-transmission, hyperreflective dots and subretinal drusenoid deposits - reticular pseudodrusen (SDD - RPD). For OPL-descent and fibrosis there were insufficient examples in the test set for a reliable performance estimate.For the other features, the model obtained an average Dice score of 0.63 +- 0.15 (median 0.64), compared to 0.61 +- 0.17 (median 0.60) for the observers. The average ICC for the model was 0.66 +- 0.22 (median 0.69), compared to 0.62 +- 0.21 (median 0.55) for the observers. For individual features, differences between model and observer Dice score were within a 95% confidence interval for all features except ellipsoid loss, where model performance was slightly better (p=0.03). Regarding ICC, model performance was slightly better for IRF (p=0.04) and ellipsoid loss (p=0.006), slightly worse for drusen and drusenoid PED (p=0.03), and within the 95% confidence interval for other features. For hyperreflective dots and SDD-RPD, FROC analysis revealed that the model performed at similar sensitivity per false positives as the observers.
-                              				Conclusions:
-                              				We present a deep-learning based model that provides accurate volumetric quantification of a comprehensive set of relevant pathological components of nAMD. There was relatively large variability in grader agreement between abnormalities. Nevertheless, model performance was comparable to, and in many cases exceeded, human performance, both in terms of overlap and quantification. The model generates a precise, quantitative morphological signature of the retinal pathology that can facilitate the development of prediction models for treatment response and planning of personalized treatment intervals, as well as further research into structure/function correlation. In clinical care it can facilitate structured reporting, reducing subjectivity in clinicians' assessments and enabling implementation of refined treatment guidelines.The presented model accelerates interpretation of OCT volumes and surpasses manual reading, both in terms of attainable level of extracted information and consistency. This can potentially lead to a reduction of costs in interpretation of clinical trials and improve personalized clinical care.},
+                                				To develop and validate an automatic model for volumetric quantification of the 13 most common abnormalities associated with neovascular age-related macular degeneration (nAMD) on optical coherence tomography (OCT).
+                                				Setting:
+                                				Clinical data and associated imaging were collected from five UK secondary care providers between February 2002 and September 2017. We identified 680 treatment-naive patients with no recent cataract surgery, at least one anti-VEGF injection, a diagnosis of nAMD, and associated OCT imaging (Topcon, Tokyo, Japan).
+                                				Methods:
+                                				A deep convolutional neural network (CNN) was used to produce a volumetric segmentation of 13 retinal abnormalities. The CNN architecture was based on a deep encoder-decoder structure that combines information from adjacent B-scans. The model was trained on 2,712 B-scans from 307 OCT volumes, with manual labels provided at a voxel-level for all abnormalities by eight graders. Abnormalities that were found in over 80 B-scans were modelled. The performance of the model and graders was assessed on an independent set of 112 B-scans from 112 OCT-volumes of nAMD cases, for which four graders independently provided annotations. To create a reference standard, the outputs of three graders were combined and defined as voxels where at least two out of three agreed. The graders' accuracy was calculated using each grader, in turn, as an observer. The Dice similarity metric was used to compare overlap, calculated per A-scan or per voxel where appropriate. Free-response receiver operator characteristic (FROC) analysis was used for the detection of small abnormalities. The intraclass correlation coefficient (ICC) was used to measure agreement on area or volume measures, with the reference area or volume defined as the average of the three graders.
+                                				Results:
+                                				Included abnormalities were: intraretinal fluid (IRF), subretinal fluid (SRF), pigment epithelial detachment (PED), subretinal hyperreflective material (SHRM), fibrosis, drusen and drusenoid PED, epiretinal membrane (ERM), outer plexiform layer (OPL) descent, ellipsoid loss, retinal pigment epithelium (RPE) loss or attenuation, hyper-transmission, hyperreflective dots and subretinal drusenoid deposits - reticular pseudodrusen (SDD - RPD). For OPL-descent and fibrosis there were insufficient examples in the test set for a reliable performance estimate. For the other features, the model obtained an average Dice score of 0.63 +- 0.15 (median 0.64), compared to 0.61 +- 0.17 (median 0.60) for the observers. The average ICC for the model was 0.66 +- 0.22 (median 0.69), compared to 0.62 +- 0.21 (median 0.55) for the observers. For individual features, differences between model and observer Dice score were within a 95% confidence interval for all features except ellipsoid loss, where model performance was slightly better (p=0.03). Regarding ICC, model performance was slightly better for IRF (p=0.04) and ellipsoid loss (p=0.006), slightly worse for drusen and drusenoid PED (p=0.03), and within the 95% confidence interval for other features. For hyperreflective dots and SDD-RPD, FROC analysis revealed that the model performed at similar sensitivity per false positives as the observers.
+                                				Conclusions:
+                                				We present a deep-learning based model that provides accurate volumetric quantification of a comprehensive set of relevant pathological components of nAMD. There was relatively large variability in grader agreement between abnormalities. Nevertheless, model performance was comparable to, and in many cases exceeded, human performance, both in terms of overlap and quantification. The model generates a precise, quantitative morphological signature of the retinal pathology that can facilitate the development of prediction models for treatment response and planning of personalized treatment intervals, as well as further research into structure/function correlation. In clinical care it can facilitate structured reporting, reducing subjectivity in clinicians' assessments and enabling implementation of refined treatment guidelines. The presented model accelerates interpretation of OCT volumes and surpasses manual reading, both in terms of attainable level of extracted information and consistency. This can potentially lead to a reduction of costs in interpretation of clinical trials and improve personalized clinical care.},
   optnote = {DIAG, RADIOLOGY},
   year = {2020},
   month = {9},
@@ -18182,19 +18210,19 @@ @article{Lief21
   title = {Quantification of key retinal features in early and late age-related macular degeneration using deep learning},
   author = {Liefers, Bart and Taylor, Paul and Alsaedi, Abdulrahman and Bailey, Clare and Balaskas, Konstantinos and Dhingra, Narendra and Egan, Catherine A and Rodrigues, Filipa Gomes and Gonz\'{a}lez-Gonzalo, Cristina and Heeren, Tjebo F.C. and Lotery, Andrew and Muller, Philipp L. and Olvera-Barrios, Abraham and Paul, Bobby and Schwartz, Roy and Thomas, Darren S. and Warwick, Alasdair N. and Tufail, Adnan and S\'{a}nchez, Clara I.},
   abstract = {Purpose:
-                              				To develop and validate a deep learning model for segmentation of 13 features associated with neovascular and atrophic age-related macular degeneration (AMD).
-
-                              				Design:
-                              				Development and validation of a deep-learning model for feature segmentation.
-
-                              				Methods:
-                              				Data for model development were obtained from 307 optical coherence tomography volumes. Eight experienced graders manually delineated all abnormalities in 2,712 B-scans. A deep neural network was trained with this data to perform voxel-level segmentation of the 13 most common abnormalities (features). For evaluation, 112 B-scans from 112 patients with a diagnosis of neovascular AMD were annotated by four independent observers. Main outcome measures were Dice score, intra-class correlation coefficient (ICC), and free-response receiver operating characteristic (FROC) curve.
-
-                              				Results:
-                              				On 11 of the 13 features, the model obtained a mean Dice score of 0.63 +- 0.15, compared to 0.61 +- 0.17 for the observers. The mean ICC for the model was 0.66 +- 0.22, compared to 0.62 +- 0.21 for the observers. Two features were not evaluated quantitatively due to lack of data. FROC analysis demonstrated that the model scored similar or higher sensitivity per false positives compared to the observers.
-
-                              				Conclusions:
-                              				The quality of the automatic segmentation matches that of experienced graders for most features, exceeding human performance for some features. The quantified parameters provided by the model can be used in the current clinical routine and open possibilities for further research into treatment response outside clinical trials.},
+                                				To develop and validate a deep learning model for segmentation of 13 features associated with neovascular and atrophic age-related macular degeneration (AMD).
+  
+                                				Design:
+                                				Development and validation of a deep-learning model for feature segmentation.
+  
+                                				Methods:
+                                				Data for model development were obtained from 307 optical coherence tomography volumes. Eight experienced graders manually delineated all abnormalities in 2,712 B-scans. A deep neural network was trained with this data to perform voxel-level segmentation of the 13 most common abnormalities (features). For evaluation, 112 B-scans from 112 patients with a diagnosis of neovascular AMD were annotated by four independent observers. Main outcome measures were Dice score, intra-class correlation coefficient (ICC), and free-response receiver operating characteristic (FROC) curve.
+  
+                                				Results:
+                                				On 11 of the 13 features, the model obtained a mean Dice score of 0.63 +- 0.15, compared to 0.61 +- 0.17 for the observers. The mean ICC for the model was 0.66 +- 0.22, compared to 0.62 +- 0.21 for the observers. Two features were not evaluated quantitatively due to lack of data. FROC analysis demonstrated that the model scored similar or higher sensitivity per false positives compared to the observers.
+  
+                                				Conclusions:
+                                				The quality of the automatic segmentation matches that of experienced graders for most features, exceeding human performance for some features. The quantified parameters provided by the model can be used in the current clinical routine and open possibilities for further research into treatment response outside clinical trials.},
   journal = AJO,
   doi = {https://doi.org/10.1016/j.ajo.2020.12.034},
   url = {https://www.sciencedirect.com/science/article/abs/pii/S0002939421000088},
@@ -18214,12 +18242,12 @@ @phdthesis{Lief22
   title = {Deep Learning Algorithms for Age-Related Macular Degeneration},
   url = {https://repository.ubn.ru.nl/handle/2066/252875},
   abstract = {This thesis is devoted to the applications of deep learning algorithms for automated analysis of retinal images.
-                             In contains chapters on:
-
-                             1. Automatic detection of the foveal center in OCT scans (Chapter 2);
-                             2. Segmentation of retinal layers and geographic atrophy (Chapter 3);
-                             3. Segmentation of geographic atrophy on color fundus (Chapter 4);
-                             4. Quantification of key retinal features in early and late AMD. (Chapter 5).},
+                               It contains chapters on:
+  
+                               1. Automatic detection of the foveal center in OCT scans (Chapter 2);
+                               2. Segmentation of retinal layers and geographic atrophy (Chapter 3);
+                               3. Segmentation of geographic atrophy on color fundus (Chapter 4);
+                               4. Quantification of key retinal features in early and late AMD. (Chapter 5).},
   copromotor = {T. Theelen},
   file = {Lief22.pdf:pdf/Lief22.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -18759,8 +18787,8 @@ @article{Litj18
   pages = {1--8},
   doi = {10.1093/gigascience/giy065},
   abstract = {Background: The presence of lymph node metastases is one of the most important factors in breast cancer prognosis. The most common strategy to assess the regional lymph node status is the sentinel lymph node procedure. The sentinel lymph node is the most likely lymph node to contain metastasized cancer cells and is excised, histopathologically processed and examined by the pathologist. This tedious examination process is time-consuming and can lead to small metastases being missed. However, recent advances in whole-slide imaging and machine learning have opened an avenue for analysis of digitized lymph node sections with computer algorithms. For example, convolutional neural networks, a type of machine learning algorithm, are able to automatically detect cancer metastases in lymph nodes with high accuracy. To train machine learning models, large, well-curated datasets are needed.
-                                                       Results: We released a dataset of 1399 annotated whole-slide images of lymph nodes, both with and without metastases, in total three terabytes of data in the context of the CAMELYON16 and CAMELYON17 Grand Challenges. Slides were collected from five different medical centers to cover a broad range of image appearance and staining variations. Each whole-slide image has a slide-level label indicating whether it contains no metastases, macro-metastases, micro-metastases or isolated tumor cells. Furthermore, for 209 whole-slide images, detailed hand-drawn contours for all metastases are provided. Last, open-source software tools to visualize and interact with the data have been made available.
-                                                       Conclusions: A unique dataset of annotated, whole-slide digital histopathology images has been provided with high potential for re-use.},
+                                                         Results: We released a dataset of 1399 annotated whole-slide images of lymph nodes, both with and without metastases, in total three terabytes of data in the context of the CAMELYON16 and CAMELYON17 Grand Challenges. Slides were collected from five different medical centers to cover a broad range of image appearance and staining variations. Each whole-slide image has a slide-level label indicating whether it contains no metastases, macro-metastases, micro-metastases or isolated tumor cells. Furthermore, for 209 whole-slide images, detailed hand-drawn contours for all metastases are provided. Last, open-source software tools to visualize and interact with the data have been made available.
+                                                         Conclusions: A unique dataset of annotated, whole-slide digital histopathology images has been provided with high potential for re-use.},
   file = {:pdf/Litj18.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   number = {6},
@@ -18933,22 +18961,22 @@ @article{Lohu24
   doi = {10.1007/s00330-024-10771-y},
   url = {http://dx.doi.org/10.1007/s00330-024-10771-y},
   abstract = {Abstract
-                     Objective
-                     Deep learning (DL) MRI reconstruction enables fast scan acquisition with good visual quality, but the diagnostic impact is often not assessed because of large reader study requirements. This study used existing diagnostic DL to assess the diagnostic quality of reconstructed images.
-
-                     Materials and methods
-                     A retrospective multisite study of 1535 patients assessed biparametric prostate MRI between 2016 and 2020. Likely clinically significant prostate cancer (csPCa) lesions (PI-RADS <jats:inline-formula><jats:tex-math>$$\ge$$</jats:tex-math><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML">
-                         <mml:mo>>=</mml:mo>
-                       </mml:math></jats:inline-formula> 4) were delineated by expert radiologists. T2-weighted scans were retrospectively undersampled, simulating accelerated protocols. DL reconstruction (DLRecon) and diagnostic DL detection (DLDetect) were developed. The effect on the partial area under (pAUC), the Free-Response Operating Characteristic (FROC) curve, and the structural similarity (SSIM) were compared as metrics for diagnostic and visual quality, respectively. DLDetect was validated with a reader concordance analysis. Statistical analysis included Wilcoxon, permutation, and Cohen's kappa tests for visual quality, diagnostic performance, and reader concordance.
-
-                     Results
-                     DLRecon improved visual quality at 4- and 8-fold (R4, R8) subsampling rates, with SSIM (range: -1 to 1) improved to 0.78 +- 0.02 (p &lt; 0.001) and 0.67 +- 0.03 (p &lt; 0.001) from 0.68 +- 0.03 and 0.51 +- 0.03, respectively. However, diagnostic performance at R4 showed a pAUC FROC of 1.33 (CI 1.28-1.39) for DL and 1.29 (CI 1.23-1.35) for naive reconstructions, both significantly lower than fully sampled pAUC of 1.58 (DL: p = 0.024, na\"{i}ve: p = 0.02). Similar trends were noted for R8.
-
-                     Conclusion
-                     DL reconstruction produces visually appealing images but may reduce diagnostic accuracy. Incorporating diagnostic AI into the assessment framework offers a clinically relevant metric essential for adopting reconstruction models into clinical practice.
-
-                     Clinical relevance statement
-                     In clinical settings, caution is warranted when using DL reconstruction for MRI scans. While it recovered visual quality, it failed to match the prostate cancer detection rates observed in scans not subjected to acceleration and DL reconstruction.},
+                       Objective
+                       Deep learning (DL) MRI reconstruction enables fast scan acquisition with good visual quality, but the diagnostic impact is often not assessed because of large reader study requirements. This study used existing diagnostic DL to assess the diagnostic quality of reconstructed images.
+  
+                       Materials and methods
+                       A retrospective multisite study of 1535 patients assessed biparametric prostate MRI between 2016 and 2020. Likely clinically significant prostate cancer (csPCa) lesions (PI-RADS
+                       $\ge$
+                       4) were delineated by expert radiologists. T2-weighted scans were retrospectively undersampled, simulating accelerated protocols. DL reconstruction (DLRecon) and diagnostic DL detection (DLDetect) were developed. The effect on the partial area under (pAUC), the Free-Response Operating Characteristic (FROC) curve, and the structural similarity (SSIM) were compared as metrics for diagnostic and visual quality, respectively. DLDetect was validated with a reader concordance analysis. Statistical analysis included Wilcoxon, permutation, and Cohen's kappa tests for visual quality, diagnostic performance, and reader concordance.
+  
+                       Results
+                       DLRecon improved visual quality at 4- and 8-fold (R4, R8) subsampling rates, with SSIM (range: -1 to 1) improved to 0.78 +- 0.02 (p < 0.001) and 0.67 +- 0.03 (p < 0.001) from 0.68 +- 0.03 and 0.51 +- 0.03, respectively. However, diagnostic performance at R4 showed a pAUC FROC of 1.33 (CI 1.28-1.39) for DL and 1.29 (CI 1.23-1.35) for naive reconstructions, both significantly lower than fully sampled pAUC of 1.58 (DL: p = 0.024, na\"{i}ve: p = 0.02). Similar trends were noted for R8.
+  
+                       Conclusion
+                       DL reconstruction produces visually appealing images but may reduce diagnostic accuracy. Incorporating diagnostic AI into the assessment framework offers a clinically relevant metric essential for adopting reconstruction models into clinical practice.
+  
+                       Clinical relevance statement
+                       In clinical settings, caution is warranted when using DL reconstruction for MRI scans. While it recovered visual quality, it failed to match the prostate cancer detection rates observed in scans not subjected to acceleration and DL reconstruction.},
   all_ss_ids = {['cd9381e9ad0b40cc225ac040c41e76b30dbade7c']},
   automatic = {yes},
   citation-count = {0},
@@ -18964,9 +18992,9 @@ @conference{Lohuizen23a
   booktitle = RSNA,
   title = {Diagnostic AI to speed up MRI protocols by identifying redundant sequences: are all diffusion-weighted prostate MRI sequences necessary?},
   abstract = {PURPOSE: Studies showed that AI reconstruction of accelerated MRI improves visual quality, but it is unclear whether this improves diagnostic value. We investigated a novel framework for accelerated MRI by assessing reconstruction quality (naive- vs AI-reconstructed) by comparing diagnostic performance and visual similarity as an outcome for prostate cancer detection.
-                          METHODS: A retrospective multi-site study was performed on a cohort of 1535 patients who underwent bi-parametric prostate MRI between 2016-2020. An expert radiologist delineated all clinically significant prostate cancer (csPCa) lesions (PI-RADS >= 4). T2W scans were retrospectively undersampled in k-space, simulating four (R4) and eight (R8) times acceleration. A 3D U-Net was used to reconstruct undersampled images. The resulting images were fed to an existing state-of-the-art csPCa detection AI to evaluate the effect of AI reconstruction on diagnosis. Visual image quality (SSIM) was compared with a Wilcoxon test. Lesion level diagnostics were evaluated by comparing the partial area-under-the-FROC-curve over the false positive interval 0.1-2.5 (pAUC) using permutation tests.
-                          RESULTS: AI-based reconstruction significantly improved visual quality compared to naive (IFFT) reconstruction MRI at R4 (SSIM 0.78+-0.02 vs 0.68+-0.03, p<0.001) and R8 (SSIM 0.67+-0.03 vs 0.51+-0.03, p<0.001), however, no significant improvements in diagnostic performance were observed for R4 (pAUC FROC 1.33 [CI 1.28-1.39] vs 1.29 [CI 1.23-1.35], p=0.37), nor R8 (pAUC FROC 1.12 [CI 1.07-1.17] vs 0.95 [CI 1.89-1.01], p=0.067). AI-based reconstruction resulted in 0.1 or more decrease in sensitivity compared to unaccelerated MRI.
-                          CONCLUSION: Recovery of visual quality in reconstructions does not correlate with recovering diagnostic quality, emphasizing the importance of measuring diagnostic value rather than visual similarity. AI reconstruction tools should be approached with care because they might have been optimized to reconstruct visually appealing images instead of diagnostic images.},
+                            METHODS: A retrospective multi-site study was performed on a cohort of 1535 patients who underwent bi-parametric prostate MRI between 2016-2020. An expert radiologist delineated all clinically significant prostate cancer (csPCa) lesions (PI-RADS >= 4). T2W scans were retrospectively undersampled in k-space, simulating four (R4) and eight (R8) times acceleration. A 3D U-Net was used to reconstruct undersampled images. The resulting images were fed to an existing state-of-the-art csPCa detection AI to evaluate the effect of AI reconstruction on diagnosis. Visual image quality (SSIM) was compared with a Wilcoxon test. Lesion level diagnostics were evaluated by comparing the partial area-under-the-FROC-curve over the false positive interval 0.1-2.5 (pAUC) using permutation tests.
+                            RESULTS: AI-based reconstruction significantly improved visual quality compared to naive (IFFT) reconstruction MRI at R4 (SSIM 0.78+-0.02 vs 0.68+-0.03, p<0.001) and R8 (SSIM 0.67+-0.03 vs 0.51+-0.03, p<0.001), however, no significant improvements in diagnostic performance were observed for R4 (pAUC FROC 1.33 [CI 1.28-1.39] vs 1.29 [CI 1.23-1.35], p=0.37), nor R8 (pAUC FROC 1.12 [CI 1.07-1.17] vs 0.95 [CI 0.89-1.01], p=0.067). AI-based reconstruction resulted in 0.1 or more decrease in sensitivity compared to unaccelerated MRI.
+                            CONCLUSION: Recovery of visual quality in reconstructions does not correlate with recovering diagnostic quality, emphasizing the importance of measuring diagnostic value rather than visual similarity. AI reconstruction tools should be approached with care because they might have been optimized to reconstruct visually appealing images instead of diagnostic images.},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
 }
@@ -18996,10 +19024,10 @@ @conference{Loma23
   booktitle = {MIDL},
   title = {Interactive Cell Detection in H&E-stained slides of Diffuse Gastric Cancer},
   abstract = {We present an interactive detection model to improve the cell annotation workflow of diffuse gastric cancer.
-                          The model relates image and user inputs and is trained to detect three types of cells in diffuse gastric cancer histology.
-                          We measure model multi-class cell detection performance as per-class F1 score and we show that it increases with the number of user input clicks.
-                          Moreover, we show that the proposed interactive annotation approach substantially reduces the number of required user actions needed for complete image annotation, achieving a 17\% reduction for the multi-class case.
-                          Future work will implement an iterative approach to filter out recurring false positives for further performance improvement.},
+                            The model relates image and user inputs and is trained to detect three types of cells in diffuse gastric cancer histology.
+                            We measure model multi-class cell detection performance as per-class F1 score and we show that it increases with the number of user input clicks.
+                            Moreover, we show that the proposed interactive annotation approach substantially reduces the number of required user actions needed for complete image annotation, achieving a 17\% reduction for the multi-class case.
+                            Future work will implement an iterative approach to filter out recurring false positives for further performance improvement.},
   optnote = {DIAG, PATHOLOGY},
   year = {2023},
 }
@@ -19009,16 +19037,16 @@ @conference{Loma23a
   booktitle = {European Congress of Pathology},
   title = {Deep learning for multi-class cell detection in H&E-stained slides of diffuse gastric cancer},
   abstract = {Background & objective
-                          Diffuse gastric cancer (DGC) is characterized by poorly cohesive cells which are difficult to detect. We propose the first deep learning model to detect classical signet ring cells (SRCs), atypical SRCs, and poorly differentiated cells in H&E-stained slides of DGC.
-
-                          Methods
-                          We collected slides from 9 patients with hereditary DGC, resulting in 105 and 3 whole-slide images (WSIs) of gastric resections and biopsies, respectively. The three target cell types were annotated, resulting in 24,695 cell-level annotations. We trained a deep learning model with the Faster-RCNN architecture using 99 WSIs in the development set.
-
-                          Results
-                          The algorithm was tested on 9 WSIs in the independent validation set. Model predictions were counted as correct if they were within a 15-micron radius from the expert reference annotations. For evaluation, we split the detection task into two components: class-independent cell localization (recognition of any tumor cell type) and cell-type classification (categorizing localized cells as the correct types). We found (average) F1 scores of 0.69 and 0.93 for the localization and classification tasks, respectively. Thus, we observe that the algorithm does not generally misclassify cells, but rather, the errors mainly arise from missing cells or false positive predictions of cells that do not belong to the three target classes.
-
-                          Conclusion
-                          Future work will focus on improving the cell localization performance of the algorithm. Cell localization of the three target classes will be an important task in a clinical application of our model, in which it could be used to improve the detection of DGC lesions among large sets of slides. Moreover, the algorithm will allow for quantitative assessment of DGC patterns, potentially giving new insights in specific morphological features of DGC such as patterns of spatial cell distributions.},
+                            Diffuse gastric cancer (DGC) is characterized by poorly cohesive cells which are difficult to detect. We propose the first deep learning model to detect classical signet ring cells (SRCs), atypical SRCs, and poorly differentiated cells in H&E-stained slides of DGC.
+  
+                            Methods
+                            We collected slides from 9 patients with hereditary DGC, resulting in 105 and 3 whole-slide images (WSIs) of gastric resections and biopsies, respectively. The three target cell types were annotated, resulting in 24,695 cell-level annotations. We trained a deep learning model with the Faster-RCNN architecture using 99 WSIs in the development set.
+  
+                            Results
+                            The algorithm was tested on 9 WSIs in the independent validation set. Model predictions were counted as correct if they were within a 15-micron radius from the expert reference annotations. For evaluation, we split the detection task into two components: class-independent cell localization (recognition of any tumor cell type) and cell-type classification (categorizing localized cells as the correct types). We found (average) F1 scores of 0.69 and 0.93 for the localization and classification tasks, respectively. Thus, we observe that the algorithm does not generally misclassify cells, but rather, the errors mainly arise from missing cells or false positive predictions of cells that do not belong to the three target classes.
+  
+                            Conclusion
+                            Future work will focus on improving the cell localization performance of the algorithm. Cell localization of the three target classes will be an important task in a clinical application of our model, in which it could be used to improve the detection of DGC lesions among large sets of slides. Moreover, the algorithm will allow for quantitative assessment of DGC patterns, potentially giving new insights in specific morphological features of DGC such as patterns of spatial cell distributions.},
   optnote = {DIAG, PATHOLOGY},
   year = {2023},
 }
@@ -19209,19 +19237,21 @@ @inproceedings{Lope03
   gscites = {4},
 }
 
-@article{Lotz21,
+@article{Lotz23,
   author = {Lotz, Johannes and Weiss, Nick and van der Laak, Jeroen and Heldmann, Stefan},
-  title = {Comparison of Consecutive and Re-stained Sections for Image Registration in Histopathology},
-  doi = {10.48550/ARXIV.2106.13150},
-  year = {2021},
-  abstract = {Purpose: In digital histopathology, virtual multi-staining is important for diagnosis and biomarker research. Additionally, it provides accurate ground-truth for various deep-learning tasks. Virtual multi-staining can be obtained using different stains for consecutive sections or by re-staining the same section. Both approaches require image registration to compensate tissue deformations, but little attention has been devoted to comparing their accuracy. Approach: We compare variational image registration of consecutive and re-stained sections and analyze the effect of the image resolution which influences accuracy and required computational resources. We present a new hybrid dataset of re-stained and consecutive sections (HyReCo, 81 slide pairs, approx. 3000 landmarks) that we made publicly available and compare its image registration results to the automatic non-rigid histological image registration (ANHIR) challenge data (230 consecutive slide pairs). Results: We obtain a median landmark error after registration of 7.1 mm (HyReCo) and 16.0 mm (ANHIR) between consecutive sections. Between re-stained sections, the median registration error is 2.3 mm and 0.9 mm in the two subsets of the HyReCo dataset. We observe that deformable registration leads to lower landmark errors than affine registration in both cases, though the effect is smaller in re-stained sections. Conclusion: Deformable registration of consecutive and re-stained sections is a valuable tool for the joint analysis of different stains. Significance: While the registration of re-stained sections allows nucleus-level alignment which allows for a direct analysis of interacting biomarkers, consecutive sections only allow the transfer of region-level annotations. The latter can be achieved at low computational cost using coarser image resolutions.},
-  url = {https://arxiv.org/abs/2106.13150},
-  file = {Lotz21.pdf:pdf\\Lotz21.pdf:PDF},
+  title = {Comparison of consecutive and restained sections for image registration in histopathology},
+  doi = {10.1117/1.jmi.10.6.067501},
+  year = {2023},
+  abstract = {Significance: Although the registration of restained sections allows nucleus-level alignment that enables a direct analysis of interacting biomarkers, consecutive sections only allow the transfer of region-level annotations. The latter can be achieved at low computational cost using coarser image resolutions. Purpose: In digital histopathology, virtual multistaining is important for diagnosis and biomarker research. Additionally, it provides accurate ground truth for various deep-learning tasks. Virtual multistaining can be obtained using different stains for consecutive sections or by restaining the same section. Both approaches require image registration to compensate for tissue deformations, but little attention has been devoted to comparing their accuracy. Approach: We compared affine and deformable variational image registration of consecutive and restained sections and analyzed the effect of the image resolution that influences accuracy and required computational resources. The registration was applied to the automatic nonrigid histological image registration (ANHIR) challenge data (230 consecutive slide pairs) and the hyperparameters were determined. Then without changing the parameters, the registration was applied to a newly published hybrid dataset of restained and consecutive sections (HyReCo, 86 slide pairs, 5404 landmarks). Results: We obtain a median landmark error after registration of 6.5 $\mu$m (HyReCo) and 24.1 $\mu$m (ANHIR) between consecutive sections. Between restained sections, the median registration error is 2.2 and 0.9 $\mu$m in the two subsets of the HyReCo dataset. We observe that deformable registration leads to lower landmark errors than affine registration in both cases (p < 0.001), though the effect is smaller in restained sections. Conclusion: Deformable registration of consecutive and restained sections is a valuable tool for the joint analysis of different stains.},
+  url = {http://dx.doi.org/10.1117/1.jmi.10.6.067501},
+  file = {Lotz23.pdf:pdf\\Lotz23.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
-  journal = {arXiv:2106.13150},
+  journal = {Journal of Medical Imaging},
   automatic = {yes},
-  all_ss_ids = {['952bdfe8a2732d537363028114718edad19bc451', '5cf666b6326b85a31b4e2759031392f0a49351b2']},
-  gscites = {6},
+  all_ss_ids = {['5cf666b6326b85a31b4e2759031392f0a49351b2']},
+  citation-count = {4},
+  volume = {10},
+  pmid = {38074626},
 }
 
 @article{Louz14,
@@ -19546,7 +19576,7 @@ @article{Mahm24
   doi = {10.1093/bjrai/ubae003},
   year = {2024},
   abstract = {Abstract
-                          The adoption of artificial intelligence (AI) tools in medicine poses challenges to existing clinical workflows. This commentary discusses the necessity of context-specific quality assurance (QA), emphasizing the need for robust QA measures with quality control (QC) procedures that encompass (1) acceptance testing (AT) before clinical use, (2) continuous QC monitoring, and (3) adequate user training. The discussion also covers essential components of AT and QA, illustrated with real-world examples. We also highlight what we see as the shared responsibility of manufacturers or vendors, regulators, healthcare systems, medical physicists, and clinicians to enact appropriate testing and oversight to ensure a safe and equitable transformation of medicine through AI.},
+                            The adoption of artificial intelligence (AI) tools in medicine poses challenges to existing clinical workflows. This commentary discusses the necessity of context-specific quality assurance (QA), emphasizing the need for robust QA measures with quality control (QC) procedures that encompass (1) acceptance testing (AT) before clinical use, (2) continuous QC monitoring, and (3) adequate user training. The discussion also covers essential components of AT and QA, illustrated with real-world examples. We also highlight what we see as the shared responsibility of manufacturers or vendors, regulators, healthcare systems, medical physicists, and clinicians to enact appropriate testing and oversight to ensure a safe and equitable transformation of medicine through AI.},
   url = {http://dx.doi.org/10.1093/bjrai/ubae003},
   file = {Mahm24.pdf:pdf\\Mahm24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -20081,15 +20111,15 @@ @conference{Mann16c
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE: White matter (WM) and gray matter (GM) respond differently to ischemia and thrombolytic treatment. Being able to differentiate WM/GM in CT enables tissue dependent perfusion analysis and automated detection of stroke related pathology. In this work we show the feasibility of segmenting WM/GM in 4DCT images of acute ischemic stroke patients.
-
-                                                       METHOD AND MATERIALS: In total 18 stroke patients who received both a 4DCT and followup MR scan were included in this retrospective study. CT imaging was done on a 320 row scanner with 19 or 24 volumetric acquisitions after contrast injection resulting in 512x512x320 isotropic voxels of 0.5 mm. T1w imaging was done on a 1.5T MR scanner resulting in approximately 384x318x26 voxels of 0.6x0.6x5.5 mm. The MR image was segmented with FSL tools and served as reference standard to train and evaluate the method. The method starts with brain segmentation by atlas registration followed by a refinement using a geodesic active contour with dominating advection term steered by a gradient based speed function. Within the segmented brain, three groups of features are then extracted: intensity, contextual and temporal, including a multiscale representation of the temporal average image weighted according to the exposures of the individual time points to maximize the signaltonoise ratios. In total 120 features were then fed into a nonlinear support vector machine with Gaussian radial basis kernel. A leaveonepatient out cross validation was carried out. Segmentation results were visually inspected for overall quality. Dice coefficient (DC) and 95th percentile Hausdorff distance (HD) were reported.
-
-                                                       RESULTS: The segmentations were evaluated as good with the separation of WM/GM at the cortex good to excellent. GM segmentation at the cortex had generally less thickness variations compared to the reference standard. DC were 0.79+-0.06 and 0.77+-0.06, 95% HD were 8.71+-3.22 and 7.11+-3.93 mm, for WM and GM, respectively.
-
-                                                       CONCLUSION: WM and GM segmentation in 4DCT is feasible.
-
-
-                                                       CLINICAL RELEVANCE/APPLICATION: WM and GM segmentation in 4DCT enables tissue dependent perfusion analysis and may increase sensitivity of detecting core and penumbra. Volume measurements of WM and GM normalized with the contralateral side may yield an important diagnostic parameter in the acute phase of ischemia.},
+  
+                                                         METHOD AND MATERIALS: In total 18 stroke patients who received both a 4DCT and follow-up MR scan were included in this retrospective study. CT imaging was done on a 320 row scanner with 19 or 24 volumetric acquisitions after contrast injection resulting in 512x512x320 isotropic voxels of 0.5 mm. T1w imaging was done on a 1.5T MR scanner resulting in approximately 384x318x26 voxels of 0.6x0.6x5.5 mm. The MR image was segmented with FSL tools and served as reference standard to train and evaluate the method. The method starts with brain segmentation by atlas registration followed by a refinement using a geodesic active contour with dominating advection term steered by a gradient based speed function. Within the segmented brain, three groups of features are then extracted: intensity, contextual and temporal, including a multiscale representation of the temporal average image weighted according to the exposures of the individual time points to maximize the signal-to-noise ratios. In total 120 features were then fed into a nonlinear support vector machine with Gaussian radial basis kernel. A leave-one-patient-out cross validation was carried out. Segmentation results were visually inspected for overall quality. Dice coefficient (DC) and 95th percentile Hausdorff distance (HD) were reported.
+  
+                                                         RESULTS: The segmentations were evaluated as good with the separation of WM/GM at the cortex good to excellent. GM segmentation at the cortex had generally less thickness variations compared to the reference standard. DC were 0.79+-0.06 and 0.77+-0.06, 95\% HD were 8.71+-3.22 and 7.11+-3.93 mm, for WM and GM, respectively.
+  
+                                                         CONCLUSION: WM and GM segmentation in 4DCT is feasible.
+  
+  
+                                                         CLINICAL RELEVANCE/APPLICATION: WM and GM segmentation in 4DCT enables tissue dependent perfusion analysis and may increase sensitivity of detecting core and penumbra. Volume measurements of WM and GM normalized with the contralateral side may yield an important diagnostic parameter in the acute phase of ischemia.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -20382,16 +20412,16 @@ @article{Meij15c
   pages = {1282-1290},
   doi = {10.1002/mrm.26024},
   abstract = {Purpose
-                                                       There is currently controversy regarding the benefits of deconvolution-based parameters in stroke imaging, with studies suggesting a similar infarct prediction using summary parameters. We investigate here the performance of deconvolution-based parameters and summary parameters for dynamic-susceptibility contrast (DSC) MRI analysis, with particular emphasis on precision.
-
-                                                       Methods
-                                                       Numerical simulations were used to assess the contribution of noise and arterial input function (AIF) variability to measurement precision. A realistic AIF range was defined based on in vivo data from an acute stroke clinical study. The simulated tissue curves were analyzed using two popular singular value decomposition (SVD) based algorithms, as well as using summary parameters.
-
-                                                       Results
-                                                       SVD-based deconvolution methods were found to considerably reduce the AIF-dependency, but a residual AIF bias remained on the calculated parameters. Summary parameters, in turn, show a lower sensitivity to noise. The residual AIF-dependency for deconvolution methods and the large AIF-sensitivity of summary parameters was greatly reduced when normalizing them relative to normal tissue.
-
-                                                       Conclusion
-                                                       Consistent with recent studies suggesting high performance of summary parameters in infarct prediction, our results suggest that DSC-MRI analysis using properly normalized summary parameters may have advantages in terms of lower noise and AIF-sensitivity as compared to commonly used deconvolution methods.},
+                                                         There is currently controversy regarding the benefits of deconvolution-based parameters in stroke imaging, with studies suggesting a similar infarct prediction using summary parameters. We investigate here the performance of deconvolution-based parameters and summary parameters for dynamic-susceptibility contrast (DSC) MRI analysis, with particular emphasis on precision.
+  
+                                                         Methods
+                                                         Numerical simulations were used to assess the contribution of noise and arterial input function (AIF) variability to measurement precision. A realistic AIF range was defined based on in vivo data from an acute stroke clinical study. The simulated tissue curves were analyzed using two popular singular value decomposition (SVD) based algorithms, as well as using summary parameters.
+  
+                                                         Results
+                                                         SVD-based deconvolution methods were found to considerably reduce the AIF-dependency, but a residual AIF bias remained on the calculated parameters. Summary parameters, in turn, show a lower sensitivity to noise. The residual AIF-dependency for deconvolution methods and the large AIF-sensitivity of summary parameters was greatly reduced when normalizing them relative to normal tissue.
+  
+                                                         Conclusion
+                                                         Consistent with recent studies suggesting high performance of summary parameters in infarct prediction, our results suggest that DSC-MRI analysis using properly normalized summary parameters may have advantages in terms of lower noise and AIF-sensitivity as compared to commonly used deconvolution methods.},
   file = {Meij15c.pdf:pdf\\Meij15c.pdf:PDF},
   optnote = {DIAG},
   pmid = {26519871},
@@ -20405,14 +20435,14 @@ @conference{Meij16
   booktitle = RSNA,
   year = {2016},
   abstract = {PURPOSE: Due to partial volume effects, accurate segmentation of small cerebral vessels on {CT} is a challenge. We present a novel technique that incorporates local intensity histogram information to segment the cerebral vasculature on {CT} perfusion ({CTP}) scans for suspected ischemic stroke.
-
-                                                       METHOD AND MATERIALS: A pattern recognition approach based on global and local image features followed by a random forest classifier is proposed. The features consist of an automatically computed brain mask denoting intracranial tissue, the first volume of the {CTP} scan, the {CTP} scan temporal average weighted according to the individual exposures to maximize signal-to-noise ratio, the weighted temporal variance ({WTV}), and local histogram features of the {WTV} calculated in a neighborhood of 9x9x9 voxels around a centered voxel. The mean, standard deviation, entropy and mode of the histogram are extracted as local feature values. In total 26 patients that underwent {CTP} for suspicion of stroke were included in this study. The {CTP} was acquired on a 320-detector row scanner. Image size was 512x512x320 voxels by 19 time points with voxel sizes of approximately 0.5 mm. Training was done on 8 patients with manually annotated data. The remaining 18 patients were used as testing set. Segmentations were visually inspected for completeness and overall quality. 3D-patches including the {M2}/{M3} segments of the middle cerebral artery were manually annotated for quantitative evaluation. The modified Hausdorff distance ({MHD}) (maximum of the median {HD}s) and the accuracy (true positive + true negative voxels divided by amount of voxels in a patch) of the segmentation were reported for the annotated patches.
-
-                                                       RESULTS: Overall the method was capable of segmenting the complete cerebral vasculature with inclusion of very small distal vessels. Parts of one internal carotid was missed in one patient because of clipping artefacts. In 3 patients false positive voxels were observed in the skull base region near the internal carotid artery and cavernous sinus. The {MHD} was 0.51A-A?A 1/2 0.28 mm, which is similar to the voxel spacing, and the accuracy was 0.97A-A?A 1/2 0.01.
-
-                                                       CONCLUSION: Our approach provides high-quality segmentation of small cerebral vessels from {CTP} data.
-
-                                                       CLINICAL RELEVANCE/APPLICATION: The high quality segmentation provided by our approach is an important step towards the automated localization and evaluation of vascular pathology in acute stroke patients.},
+  
+                                                         METHOD AND MATERIALS: A pattern recognition approach based on global and local image features followed by a random forest classifier is proposed. The features consist of an automatically computed brain mask denoting intracranial tissue, the first volume of the {CTP} scan, the {CTP} scan temporal average weighted according to the individual exposures to maximize signal-to-noise ratio, the weighted temporal variance ({WTV}), and local histogram features of the {WTV} calculated in a neighborhood of 9x9x9 voxels around a centered voxel. The mean, standard deviation, entropy and mode of the histogram are extracted as local feature values. In total 26 patients that underwent {CTP} for suspicion of stroke were included in this study. The {CTP} was acquired on a 320-detector row scanner. Image size was 512x512x320 voxels by 19 time points with voxel sizes of approximately 0.5 mm. Training was done on 8 patients with manually annotated data. The remaining 18 patients were used as testing set. Segmentations were visually inspected for completeness and overall quality. 3D-patches including the {M2}/{M3} segments of the middle cerebral artery were manually annotated for quantitative evaluation. The modified Hausdorff distance ({MHD}) (maximum of the median {HD}s) and the accuracy (true positive + true negative voxels divided by amount of voxels in a patch) of the segmentation were reported for the annotated patches.
+  
+                                                         RESULTS: Overall the method was capable of segmenting the complete cerebral vasculature with inclusion of very small distal vessels. Parts of one internal carotid artery were missed in one patient because of clipping artefacts. In 3 patients false positive voxels were observed in the skull base region near the internal carotid artery and cavernous sinus. The {MHD} was 0.51 +- 0.28 mm, which is similar to the voxel spacing, and the accuracy was 0.97 +- 0.01.
+  
+                                                         CONCLUSION: Our approach provides high-quality segmentation of small cerebral vessels from {CTP} data.
+  
+                                                         CLINICAL RELEVANCE/APPLICATION: The high quality segmentation provided by our approach is an important step towards the automated localization and evaluation of vascular pathology in acute stroke patients.},
   file = {Meij16.pdf:pdf\\Meij16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -20423,13 +20453,13 @@ @conference{Meij17
   booktitle = ECR,
   year = {2017},
   abstract = {PURPOSE
-                                                       To perform a pilot study to explore the effect of a new post-processing technique for 4D-CTA on speed and accuracy of the detection of intracranial vessel occlusions in acute stroke. This technique color-codes the contrast arrival time in the cerebral vasculature in 4D-CTA so that abnormally delayed vascular territories are easily detected.
-                                                       METHOD AND MATERIALS
-                                                       We selected 10 patients without and 10 patients with a confirmed single vessel occlusion on CTA from our database of acute ischemic stroke patients, so that occlusions of the ICA, MCA, ACA and PCA of varying subtlety were included. Whole-brain CT perfusion was performed on a 320 detector-row scanner. Color-coded 4D-CTA images were obtained by centering the color scale of vessel time-to-peak (TTP) on the modus of the TTP histogram. Temporal MIP of 4D-CTA with and without color-coding were evaluated in random order for the presence of vessel occlusion by to two neuroradiologists. Time-to-detection and accuracy of detection of vessel occlusions were evaluated.
-                                                       RESULTS
-                                                       One false-positive vessel occlusion was rated on color-mapping by both observers. Overall, the average time-to-detection decreased from 37.0s to 19.4s (p<0.03) and the average accuracy of vessel occlusion detection increased from 0.825 to 0.85 with color-mapping.
-                                                       CONCLUSION
-                                                       Color-mapping of cerebral vasculature in 4D-CTA improves the speed and may improve the accuracy of the detection of vessel occlusions in acute stroke patients.},
+                                                         To perform a pilot study to explore the effect of a new post-processing technique for 4D-CTA on speed and accuracy of the detection of intracranial vessel occlusions in acute stroke. This technique color-codes the contrast arrival time in the cerebral vasculature in 4D-CTA so that abnormally delayed vascular territories are easily detected.
+                                                         METHOD AND MATERIALS
+                                                         We selected 10 patients without and 10 patients with a confirmed single vessel occlusion on CTA from our database of acute ischemic stroke patients, so that occlusions of the ICA, MCA, ACA and PCA of varying subtlety were included. Whole-brain CT perfusion was performed on a 320 detector-row scanner. Color-coded 4D-CTA images were obtained by centering the color scale of vessel time-to-peak (TTP) on the modus of the TTP histogram. Temporal MIP of 4D-CTA with and without color-coding were evaluated in random order for the presence of vessel occlusion by two neuroradiologists. Time-to-detection and accuracy of detection of vessel occlusions were evaluated.
+                                                         RESULTS
+                                                         One false-positive vessel occlusion was rated on color-mapping by both observers. Overall, the average time-to-detection decreased from 37.0s to 19.4s (p<0.03) and the average accuracy of vessel occlusion detection increased from 0.825 to 0.85 with color-mapping.
+                                                         CONCLUSION
+                                                         Color-mapping of cerebral vasculature in 4D-CTA improves the speed and may improve the accuracy of the detection of vessel occlusions in acute stroke patients.},
   file = {Meij17.pdf:pdf\\Meij17.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -20441,7 +20471,7 @@ @article{Meij17a
   year = {2017},
   volume = {7},
   abstract = {A robust method is presented for the segmentation of the full cerebral vasculature in 4-dimensional (4D) computed tomography (CT). The method consists of candidate vessel selection, feature extraction, random forest classiffcation and postprocessing. Image features include among others the weighted temporal variance image and parameters, including entropy, of an intensity histogram in a local region at di
-                                                       erent scales. These histogram parameters revealed to be a strong feature in the detection of vessels regardless of shape and size. The method was trained and tested on a large database of 264 patients with suspicion of acute ischemic stroke who underwent 4D CT in our hospital in the period January 2014 to December 2015. In this database there is a large variety of patients observed in every day clinical practice. The method was trained on 19 4D CT images of patients with manual annotations by two trained medical assistants. Five subvolumes representing different regions of the cerebral vasculature were annotated in each image in the training set. The evaluation of the method was done on 242 patients. One out of fve subvolumes was randomly annotated in 159 patients and was used for quantitative evaluation. Segmentations were inspected visually for the entire study cohort to assess failures. A total of 16 (<8%) patients showed severe under- or over-segmentation and were reported as failures. Quantitative evaluation in comparison to the reference annotation showed a Dice coeffcient of 0.91 +- 0.07 and a modiffed Hausdorff distance of 0.23 +- 0.22 mm, which is smaller than voxel spacing.},
+                                                         erent scales. These histogram parameters revealed to be a strong feature in the detection of vessels regardless of shape and size. The method was trained and tested on a large database of 264 patients with suspicion of acute ischemic stroke who underwent 4D CT in our hospital in the period January 2014 to December 2015. In this database there is a large variety of patients observed in everyday clinical practice. The method was trained on 19 4D CT images of patients with manual annotations by two trained medical assistants. Five subvolumes representing different regions of the cerebral vasculature were annotated in each image in the training set. The evaluation of the method was done on 242 patients. One out of five subvolumes was randomly annotated in 159 patients and was used for quantitative evaluation. Segmentations were inspected visually for the entire study cohort to assess failures. A total of 16 (<8%) patients showed severe under- or over-segmentation and were reported as failures. Quantitative evaluation in comparison to the reference annotation showed a Dice coefficient of 0.91 +- 0.07 and a modified Hausdorff distance of 0.23 +- 0.22 mm, which is smaller than voxel spacing.},
   doi = {10.1038/s41598-017-15617-w},
   file = {Meij17a.pdf:pdf\\Meij17a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -20462,7 +20492,7 @@ @inproceedings{Meij18
   pages = {105751Q},
   doi = {10.1117/12.2292974},
   abstract = {Segmentation of the arteries and veins of the cerebral vasculature is important for improved visualization and for the detection of vascular related pathologies including arterio-venous malformations. We propose a three dimensional fully convolutational neural network (CNN), with Time-to-Signal images as input, extended with the distance to the center of gravity of the brain as spatial feature integrated at the abstract level of the CNN. The method is trained and validated on 6 and tested on 4 4D CT patient imaging data. The reference standard was acquired by manual annotations by an experienced observer. Quantitative evaluation shows a mean Dice similarity coefficient of 0.936 +- 0.027 and 0.973 +- 0.012, a mean absolute volume di
-                                                       erence of 4.36 +- 5.47 % and 1.79 +- 2.26 % for artery and vein respectively and an overall accuracy of 0.962 +- 0.017. Average calculation time per volume on the test set was approximately one minute. Our method shows promising results and enables fast and accurate segmentation of arteries and veins in full 4D CT imaging data.},
+                                                         erence of 4.36 +- 5.47 % and 1.79 +- 2.26 % for artery and vein respectively and an overall accuracy of 0.962 +- 0.017. Average calculation time per volume on the test set was approximately one minute. Our method shows promising results and enables fast and accurate segmentation of arteries and veins in full 4D CT imaging data.},
   file = {Meij18.pdf:pdf\\Meij18.pdf:PDF},
   optnote = {DIAG, Radiology},
   month = {2},
@@ -20481,12 +20511,12 @@ @article{Meij18a
   pages = {421-426},
   doi = {10.1016/j.wneu.2018.02.189},
   abstract = {Background
-                                                       In case of carotid artery occlusion, the risk and extent of ischemic cerebral damage is highly dependent on the pathways of collateral flow, including the anatomy of the circle of Willis.
-                                                       In this report, cases are presented to illustrate that {4D-CTA} can be considered as a noninvasive alternative to DSA for the evaluation of circle of Willis collateral flow.
-                                                       Case Description
-                                                       Five patients with unilateral internal carotid artery ({ICA}) occlusion underwent {4D-CTA} for the evaluation of intracranial hemodynamics. Next to a visual evaluation of {4D-CTA}, temporal information was visualized using a normalized color scale on the cerebral vasculature, which enabled quantification of the contrast bolus arrival time. In these patients, {4D-CTA} demonstrated dominant {MCA} blood supply on the side of {ICA} occlusion originating either from the contralateral {ICA} or from the posterior circulation via the communicating arteries.
-                                                       Conclusions
-                                                       Temporal dynamics of collateral flow in the circle of Willis can be depicted with {4D-CTA} in patients with a unilateral carotid artery occlusion.},
+                                                         In case of carotid artery occlusion, the risk and extent of ischemic cerebral damage is highly dependent on the pathways of collateral flow, including the anatomy of the circle of Willis.
+                                                         In this report, cases are presented to illustrate that {4D-CTA} can be considered as a noninvasive alternative to DSA for the evaluation of circle of Willis collateral flow.
+                                                         Case Description
+                                                         Five patients with unilateral internal carotid artery ({ICA}) occlusion underwent {4D-CTA} for the evaluation of intracranial hemodynamics. Next to a visual evaluation of {4D-CTA}, temporal information was visualized using a normalized color scale on the cerebral vasculature, which enabled quantification of the contrast bolus arrival time. In these patients, {4D-CTA} demonstrated dominant {MCA} blood supply on the side of {ICA} occlusion originating either from the contralateral {ICA} or from the posterior circulation via the communicating arteries.
+                                                         Conclusions
+                                                         Temporal dynamics of collateral flow in the circle of Willis can be depicted with {4D-CTA} in patients with a unilateral carotid artery occlusion.},
   file = {Meij18a.pdf:pdf\\Meij18a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29530689},
@@ -20503,13 +20533,13 @@ @conference{Meij18b
   booktitle = RSNA,
   year = {2018},
   abstract = {PURPOSE: Segmentation of the complete cerebral vasculature in {4D-CTA} is important for improved visualization, automated pathology detection and assessment of the collateral flow. We present a deep learning approach to segment the complete cerebral vasculature in {4D-CTA} of patients with suspected stroke.
-                                                       MATERIALS AND METHODS: In total 162 patients that underwent {4D-CTA} for suspicion of stroke were included in this study. The scans were acquired on a 320-detector row scanner (Canon Medical Systems Corporation, Japan). Image size was 512x512x320 voxels by 19 time points with isotropic voxel sizes of approximately 0.5 mm. A 3D fully convolutional neural network ({CNN}), U-Net, was proposed with integration of a spatial feature in the final convolutional layer of the network. The weighted temporal average and variance were derived from the 4D-CTA and used as input for the network. As spatial feature the Euclidean distance from the center of the brain to the skull was used. Training was done on 19 patients with manually annotated data. The remaining 143 patients were used as testing set. Segmentations were visually inspected for completeness and overall quality. Two observers manually annotated three dimensional sub-volumes throughout the brain to include different sized vessels for quantitative evaluation. The Dice similarity coefficient ({DSC}) and Mean Contour Distance ({MCD}) of the segmentations were reported.
-                                                       RESULTS
-                                                       Overall the method was capable of segmenting the complete cerebral vasculature. Smaller distal vessels (e.g. M3) showed similar segmentation results as the larger vessels (e.g. internal carotid artery). The {DSC} was 0.91+-0.08 and the {MCD} was 0.26+-0.24 mm which is below voxel spacing. Computation time was less than 90 seconds for processing a full {4D-CTA} data set.
-                                                       CONCLUSION:
-                                                       A 3D U-Net with spatial features provides fast, robust and accurate segmentations of the full cerebral vasculature in {4D-CTA}.
-                                                       Clinical Relevance
-                                                       The high quality segmentation provided by our method is an important step towards the automated localization and evaluation of vascular pathology in acute stroke patients.},
+                                                         MATERIALS AND METHODS: In total 162 patients that underwent {4D-CTA} for suspicion of stroke were included in this study. The scans were acquired on a 320-detector row scanner (Canon Medical Systems Corporation, Japan). Image size was 512x512x320 voxels by 19 time points with isotropic voxel sizes of approximately 0.5 mm. A 3D fully convolutional neural network ({CNN}), U-Net, was proposed with integration of a spatial feature in the final convolutional layer of the network. The weighted temporal average and variance were derived from the 4D-CTA and used as input for the network. As spatial feature the Euclidean distance from the center of the brain to the skull was used. Training was done on 19 patients with manually annotated data. The remaining 143 patients were used as testing set. Segmentations were visually inspected for completeness and overall quality. Two observers manually annotated three dimensional sub-volumes throughout the brain to include different sized vessels for quantitative evaluation. The Dice similarity coefficient ({DSC}) and Mean Contour Distance ({MCD}) of the segmentations were reported.
+                                                         RESULTS
+                                                         Overall the method was capable of segmenting the complete cerebral vasculature. Smaller distal vessels (e.g. M3) showed similar segmentation results as the larger vessels (e.g. internal carotid artery). The {DSC} was 0.91+-0.08 and the {MCD} was 0.26+-0.24 mm which is below voxel spacing. Computation time was less than 90 seconds for processing a full {4D-CTA} data set.
+                                                         CONCLUSION:
+                                                         A 3D U-Net with spatial features provides fast, robust and accurate segmentations of the full cerebral vasculature in {4D-CTA}.
+                                                         Clinical Relevance
+                                                         The high quality segmentation provided by our method is an important step towards the automated localization and evaluation of vascular pathology in acute stroke patients.},
   file = {Meij18b.pdf:pdf\\Meij18b.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -20520,14 +20550,14 @@ @conference{Meij18c
   booktitle = ESNR,
   year = {2018},
   abstract = {{PURPOSE}
-                                                       Nowadays {4D-CTA} is available as a non-invasive alternative to conventional angiography, with reported high diagnostic accuracy in the evaluation of different neurovascular disorders, including arteriovenous shunts and collateral flow pathways. Optimized processing of {4D-CTA} is crucial, considering the large amount of data generated. Enhanced visualization of {4D-CTA} can be achieved by applying color-mapping of temporal information in the cerebral vasculature.
-                                                       {METHOD AND MATERIALS}
-                                                       Color-mapping processing of {4D-CTA} is achieved in two steps. First, the full cerebral vasculature is segmented by features extraction and random forest classification. Second, the color-scale is adjusted using the histogram using the histogram of the arrival times of the segmented vessels. Early contrast bolus arrival(e.g. healthy internal carotid artery) is labeled red, intermediate arrival yellow, and delayed contrast arrival is labeled blue.
-                                                       Color-mapping of {4D-CTA} was applied in patients suspected of cranial arteriovenous shunts, and in patients with unilateral carotid artery occlusion for the evaluation of circle of Willis collateral flow. The patients were scanned on a wide-row 320 slice detector {CT} (Toshiba Aquilion {ONE}), enabling whole-head coverage at high temporal resolution.
-                                                       {RESULTS}
-                                                       Arterialization of venous vascular structures is the hallmark of arterio-venous shunts, which is easily and accurately identified on color-mapping of {4D-CTA}. Temporal dynamics of collateral flow in the circle of Willis is adequately depicted with {4D-CTA} in patients with unilateral carotid artery occlusion.
-                                                       {CONCLUSION}
-                                                       Color-mapping of {4D-CTA} accurately displays temporal information of the cerebral vasculature, which can facilitate the detection of arterio-venous shunts and the evaluation of collateral flow in intracranial steno-occlusive disease.},
+                                                         Nowadays {4D-CTA} is available as a non-invasive alternative to conventional angiography, with reported high diagnostic accuracy in the evaluation of different neurovascular disorders, including arteriovenous shunts and collateral flow pathways. Optimized processing of {4D-CTA} is crucial, considering the large amount of data generated. Enhanced visualization of {4D-CTA} can be achieved by applying color-mapping of temporal information in the cerebral vasculature.
+                                                         {METHOD AND MATERIALS}
+                                                         Color-mapping processing of {4D-CTA} is achieved in two steps. First, the full cerebral vasculature is segmented by feature extraction and random forest classification. Second, the color-scale is adjusted using the histogram of the arrival times of the segmented vessels. Early contrast bolus arrival (e.g. healthy internal carotid artery) is labeled red, intermediate arrival yellow, and delayed contrast arrival is labeled blue.
+                                                         Color-mapping of {4D-CTA} was applied in patients suspected of cranial arteriovenous shunts, and in patients with unilateral carotid artery occlusion for the evaluation of circle of Willis collateral flow. The patients were scanned on a wide-row 320 slice detector {CT} (Toshiba Aquilion {ONE}), enabling whole-head coverage at high temporal resolution.
+                                                         {RESULTS}
+                                                         Arterialization of venous vascular structures is the hallmark of arterio-venous shunts, which is easily and accurately identified on color-mapping of {4D-CTA}. Temporal dynamics of collateral flow in the circle of Willis is adequately depicted with {4D-CTA} in patients with unilateral carotid artery occlusion.
+                                                         {CONCLUSION}
+                                                         Color-mapping of {4D-CTA} accurately displays temporal information of the cerebral vasculature, which can facilitate the detection of arterio-venous shunts and the evaluation of collateral flow in intracranial steno-occlusive disease.},
   file = {Meij18c.pdf:pdf\\Meij18c.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -21160,16 +21190,16 @@ @article{Mert17
   pages = {935-948},
   doi = {10.1002/mp.12077},
   abstract = {PURPOSE:
-                                                       In breast imaging, radiological in vivo images, such as x-ray mammography and magnetic resonance imaging (MRI), are used for tumor detection, diagnosis, and size determination. After excision, the specimen is typically sliced into slabs and a small subset is sampled. Histopathological imaging of the stained samples is used as the gold standard for characterization of the tumor microenvironment. A 3D volume reconstruction of the whole specimen from the 2D slabs could facilitate bridging the gap between histology and in vivo radiological imaging. This task is challenging, however, due to the large deformation that the breast tissue undergoes after surgery and the significant undersampling of the specimen obtained in histology. In this work, we present a method to reconstruct a coherent 3D volume from 2D digital radiographs of the specimen slabs.
-
-                                                       METHODS:
-                                                       To reconstruct a 3D breast specimen volume, we propose the use of multiple target neighboring slices, when deforming each 2D slab radiograph in the volume, rather than performing pairwise registrations. The algorithm combines neighborhood slice information with free-form deformations, which enables a flexible, nonlinear deformation to be computed subject to the constraint that a coherent 3D volume is obtained. The neighborhood information provides adequate constraints, without the need for any additional regularization terms.
-
-                                                       RESULTS:
-                                                       The volume reconstruction algorithm is validated on clinical mastectomy samples using a quantitative assessment of the volume reconstruction smoothness and a comparison with a whole specimen 3D image acquired for validation before slicing. Additionally, a target registration error of 5 mm (comparable to the specimen slab thickness of 4 mm) was obtained for five cases. The error was computed using manual annotations from four observers as gold standard, with interobserver variability of 3.4 mm. Finally, we illustrate how the reconstructed volumes can be used to map histology images to a 3D specimen image of the whole sample (either MRI or CT).
-
-                                                       CONCLUSIONS:
-                                                       Qualitative and quantitative assessment has illustrated the benefit of using our proposed methodology to reconstruct a coherent specimen volume from serial slab radiographs. To our knowledge, this is the first method that has been applied to clinical breast cases, with the goal of reconstructing a whole specimen sample. The algorithm can be used as part of the pipeline of mapping histology images to ex vivo and ultimately in vivo radiological images of the breast.},
+                                                         In breast imaging, radiological in vivo images, such as x-ray mammography and magnetic resonance imaging (MRI), are used for tumor detection, diagnosis, and size determination. After excision, the specimen is typically sliced into slabs and a small subset is sampled. Histopathological imaging of the stained samples is used as the gold standard for characterization of the tumor microenvironment. A 3D volume reconstruction of the whole specimen from the 2D slabs could facilitate bridging the gap between histology and in vivo radiological imaging. This task is challenging, however, due to the large deformation that the breast tissue undergoes after surgery and the significant undersampling of the specimen obtained in histology. In this work, we present a method to reconstruct a coherent 3D volume from 2D digital radiographs of the specimen slabs.
+  
+                                                         METHODS:
+                                                         To reconstruct a 3D breast specimen volume, we propose the use of multiple target neighboring slices, when deforming each 2D slab radiograph in the volume, rather than performing pairwise registrations. The algorithm combines neighborhood slice information with free-form deformations, which enables a flexible, nonlinear deformation to be computed subject to the constraint that a coherent 3D volume is obtained. The neighborhood information provides adequate constraints, without the need for any additional regularization terms.
+  
+                                                         RESULTS:
+                                                         The volume reconstruction algorithm is validated on clinical mastectomy samples using a quantitative assessment of the volume reconstruction smoothness and a comparison with a whole specimen 3D image acquired for validation before slicing. Additionally, a target registration error of 5 mm (comparable to the specimen slab thickness of 4 mm) was obtained for five cases. The error was computed using manual annotations from four observers as gold standard, with interobserver variability of 3.4 mm. Finally, we illustrate how the reconstructed volumes can be used to map histology images to a 3D specimen image of the whole sample (either MRI or CT).
+  
+                                                         CONCLUSIONS:
+                                                         Qualitative and quantitative assessment has illustrated the benefit of using our proposed methodology to reconstruct a coherent specimen volume from serial slab radiographs. To our knowledge, this is the first method that has been applied to clinical breast cases, with the goal of reconstructing a whole specimen sample. The algorithm can be used as part of the pipeline of mapping histology images to ex vivo and ultimately in vivo radiological images of the breast.},
   file = {Mert17.pdf:pdf\\Mert17.pdf:PDF},
   optnote = {DIAG},
   pmid = {28064435},
@@ -21542,7 +21572,7 @@ @article{Mies22
   doi = {10.1242/dmm.046342},
   year = {2022},
   abstract = {ABSTRACT
-                                        In the glomerulus, Bowman's space is formed by a continuum of glomerular epithelial cells. In focal segmental glomerulosclerosis (FSGS), glomeruli show segmental scarring, a result of activated parietal epithelial cells (PECs) invading the glomerular tuft. The segmental scars interrupt the epithelial continuum. However, non-sclerotic segments seem to be preserved even in glomeruli with advanced lesions. We studied the histology of the segmental pattern in Munich Wistar Fr\"{o}mter rats, a model for secondary FSGS. Our results showed that matrix layers lined with PECs cover the sclerotic lesions. These PECs formed contacts with podocytes of the uninvolved tuft segments, restoring the epithelial continuum. Formed Bowman's spaces were still connected to the tubular system. In biopsies of patients with secondary FSGS, we also detected matrix layers formed by PECs, separating the uninvolved from the sclerotic glomerular segments. PECs have a major role in the formation of glomerulosclerosis; we show here that in FSGS they also restore the glomerular epithelial cell continuum that surrounds Bowman's space. This process may be beneficial and indispensable for glomerular filtration in the uninvolved segments of sclerotic glomeruli.},
+                                          In the glomerulus, Bowman's space is formed by a continuum of glomerular epithelial cells. In focal segmental glomerulosclerosis (FSGS), glomeruli show segmental scarring, a result of activated parietal epithelial cells (PECs) invading the glomerular tuft. The segmental scars interrupt the epithelial continuum. However, non-sclerotic segments seem to be preserved even in glomeruli with advanced lesions. We studied the histology of the segmental pattern in Munich Wistar Fr\"{o}mter rats, a model for secondary FSGS. Our results showed that matrix layers lined with PECs cover the sclerotic lesions. These PECs formed contacts with podocytes of the uninvolved tuft segments, restoring the epithelial continuum. Formed Bowman's spaces were still connected to the tubular system. In biopsies of patients with secondary FSGS, we also detected matrix layers formed by PECs, separating the uninvolved from the sclerotic glomerular segments. PECs have a major role in the formation of glomerulosclerosis; we show here that in FSGS they also restore the glomerular epithelial cell continuum that surrounds Bowman's space. This process may be beneficial and indispensable for glomerular filtration in the uninvolved segments of sclerotic glomeruli.},
   url = {http://dx.doi.org/10.1242/dmm.046342},
   file = {Mies22.pdf:pdf\\Mies22.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -21708,9 +21738,9 @@ @mastersthesis{Mooi19
   author = {Germonda Mooij},
   title = {Using GANs to synthetically stain histopathological images to generate training data for automatic mitosis detection in breast tissue},
   abstract = {Generative adversarial networks (GANs) have been proven effective at mapping medical images from one domain to another (e.g. from CT to MRI).
-                              In this study we investigate the effectiveness of GANs at mapping images of breast tissue between histopathological stains.
-                              Breast cancer is the most common cancer in women worldwide. Counting mitotic figures in histological images of breast cancer tissue has been shown to be a reliable and independent prognostic marker. Most successful methods for automatic counting involve training deep neural networks on H&E stained slides. This training requires extensive manual annotations of mitotic figures in H&E stained slides, which suffers from a low inter-observer agreement. Manual counting in PHH3 stained slides has a much higher inter-observer agreement.
-                              In this project we aimed to train GANs to map PHH3 slides to synthetic H&E slides and vice versa. A mitosis classifier is used to quantify the quality of the synthetic images, by comparing its performance after training on synthetic images with training on real images.},
+                                In this study we investigate the effectiveness of GANs at mapping images of breast tissue between histopathological stains.
+                                Breast cancer is the most common cancer in women worldwide. Counting mitotic figures in histological images of breast cancer tissue has been shown to be a reliable and independent prognostic marker. Most successful methods for automatic counting involve training deep neural networks on H&E stained slides. This training requires extensive manual annotations of mitotic figures in H&E stained slides, which suffers from a low inter-observer agreement. Manual counting in PHH3 stained slides has a much higher inter-observer agreement.
+                                In this project we aimed to train GANs to map PHH3 slides to synthetic H&E slides and vice versa. A mitosis classifier is used to quantify the quality of the synthetic images, by comparing its performance after training on synthetic images with training on real images.},
   file = {Mooi19.pdf:pdf/Mooi19.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University Medical Center},
@@ -21736,17 +21766,19 @@ @article{Mook09
   gscites = {64},
 }
 
-@inproceedings{Moor18a,
-  author = {Timothy de Moor and Alejandro Rodriguez-Ruiz and Ritse Mann and Jonas Teuwen},
-  title = {Automated soft tissue lesion detection and segmentation in digital mammography using a u-net deep learning network},
-  booktitle = {International Workshop on Breast Imaging},
+@inproceedings{Moor18,
+  author = {de Moor, Timothy and Rodriguez-Ruiz, Alejandro and Mann, Ritse and Gubern-M\'{e}rida, Albert and Teuwen, Jonas},
+  title = {Automated lesion detection and segmentation in digital mammography using a u-net deep learning network},
+  doi = {10.1117/12.2318326},
   year = {2018},
-  url = {https://arxiv.org/abs/1802.06865},
-  abstract = {Computer-aided detection or decision support systems aim to improve breast cancer screening programs by helping radiologists to evaluate digital mammography (DM) exams. Commonly such methods proceed in two steps: selection of candidate regions for malignancy, and later classification as either malignant or not. In this study, we present a candidate detection method based on deep learning to automatically detect and additionally segment soft tissue lesions in DM. A database of DM exams (mostly bilateral and two views) was collected from our institutional archive.},
-  file = {:pdf/Moor18a.pdf:PDF},
+  abstract = {Computer-aided detection or decision support systems aim to improve breast cancer screening programs by helping radiologists to evaluate digital mammography (DM) exams. Commonly such methods proceed in two steps: selection of candidate regions for malignancy, and later classification as either malignant or not. In this study, we present a candidate detection method based on deep learning to automatically detect and additionally segment soft tissue lesions in DM. A database of DM exams (mostly bilateral and two views) was collected from our institutional archive. In total, 7196 DM exams (28294 DM images) acquired with systems from three different vendors (General Electric, Siemens, Hologic) were collected, of which 2883 contained malignant lesions verified with histopathology. Data was randomly split on an exam level into training (50%), validation (10%) and testing (40%) of deep neural network with u-net architecture. The u-net classifies the image but also provides lesion segmentation. Free receiver operating characteristic (FROC) analysis was used to evaluate the model, on an image and on an exam level. On an image level, a maximum sensitivity of 0.94 at 7.93 false positives (FP) per image was achieved. Similarly, per exam a maximum sensitivity of 0.98 at 7.81 FP per image was achieved. In conclusion, the method could be used as a candidate selection model with high accuracy and with the additional information of lesion segmentation.},
+  url = {http://dx.doi.org/10.1117/12.2318326},
+  file = {Moor18.pdf:pdf\\Moor18.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
+  booktitle = {14th International Workshop on Breast Imaging (IWBI 2018)},
+  automatic = {yes},
   all_ss_ids = {['9fca3cab7f8cf1071094591bc13ca53de528dac6']},
-  gscites = {32},
+  citation-count = {16},
 }
 
 @article{Moor2018,
@@ -21866,13 +21898,13 @@ @article{Mord16
   doi = {10.1118/1.4943376},
   url = {http://dx.doi.org/10.1118/1.4943376},
   abstract = {PURPOSE:
-                                                       In the past decades, computer-aided detection (CADe) systems have been developed to aid screening radiologists in the detection of malignant microcalcifications. These systems are useful to avoid perceptual oversights and can increase the radiologists' detection rate. However, due to the high number of false positives marked by these CADe systems, they are not yet suitable as an independent reader. Breast arterial calcifications (BACs) are one of the most frequent false positives marked by CADe systems. In this study, a method is proposed for the elimination of BACs as positive findings. Removal of these false positives will increase the performance of the CADe system in finding malignant microcalcifications.
-                                                       METHODS:
-                                                       A multistage method is proposed for the removal of BAC findings. The first stage consists of a microcalcification candidate selection, segmentation and grouping of the microcalcifications, and classification to remove obvious false positives. In the second stage, a case-based selection is applied where cases are selected which contain BACs. In the final stage, BACs are removed from the selected cases. The BACs removal stage consists of a GentleBoost classifier trained on microcalcification features describing their shape, topology, and texture. Additionally, novel features are introduced to discriminate BACs from other positive findings.
-                                                       RESULTS:
-                                                       The CADe system was evaluated with and without BACs removal. Here, both systems were applied on a validation set containing 1088 cases of which 95 cases contained malignant microcalcifications. After bootstrapping, free-response receiver operating characteristics and receiver operating characteristics analyses were carried out. Performance between the two systems was compared at 0.98 and 0.95 specificity. At a specificity of 0.98, the sensitivity increased from 37% to 52% and the sensitivity increased from 62% up to 76% at a specificity of 0.95. Partial areas under the curve in the specificity range of 0.8-1.0 were significantly different between the system without BACs removal and the system with BACs removal, 0.129 A+- 0.009 versus 0.144 A+- 0.008 (p<0.05), respectively. Additionally, the sensitivity at one false positive per 50 cases and one false positive per 25 cases increased as well, 37% versus 51% (p<0.05) and 58% versus 67% (p<0.05) sensitivity, respectively. Additionally, the CADe system with BACs removal reduces the number of false positives per case by 29% on average. The same sensitivity at one false positive per 50 cases in the CADe system without BACs removal can be achieved at one false positive per 80 cases in the CADe system with BACs removal.
-                                                       CONCLUSIONS:
-                                                       By using dedicated algorithms to detect and remove breast arterial calcifications, the performance of CADe systems can be improved, in particular, at false positive rates representative for operating points used in screening.},
+                                                         In the past decades, computer-aided detection (CADe) systems have been developed to aid screening radiologists in the detection of malignant microcalcifications. These systems are useful to avoid perceptual oversights and can increase the radiologists' detection rate. However, due to the high number of false positives marked by these CADe systems, they are not yet suitable as an independent reader. Breast arterial calcifications (BACs) are one of the most frequent false positives marked by CADe systems. In this study, a method is proposed for the elimination of BACs as positive findings. Removal of these false positives will increase the performance of the CADe system in finding malignant microcalcifications.
+                                                         METHODS:
+                                                         A multistage method is proposed for the removal of BAC findings. The first stage consists of a microcalcification candidate selection, segmentation and grouping of the microcalcifications, and classification to remove obvious false positives. In the second stage, a case-based selection is applied where cases are selected which contain BACs. In the final stage, BACs are removed from the selected cases. The BACs removal stage consists of a GentleBoost classifier trained on microcalcification features describing their shape, topology, and texture. Additionally, novel features are introduced to discriminate BACs from other positive findings.
+                                                         RESULTS:
+                                                         The CADe system was evaluated with and without BACs removal. Here, both systems were applied on a validation set containing 1088 cases of which 95 cases contained malignant microcalcifications. After bootstrapping, free-response receiver operating characteristics and receiver operating characteristics analyses were carried out. Performance between the two systems was compared at 0.98 and 0.95 specificity. At a specificity of 0.98, the sensitivity increased from 37% to 52% and the sensitivity increased from 62% up to 76% at a specificity of 0.95. Partial areas under the curve in the specificity range of 0.8-1.0 were significantly different between the system without BACs removal and the system with BACs removal, 0.129 $\pm$ 0.009 versus 0.144 $\pm$ 0.008 (p<0.05), respectively. Additionally, the sensitivity at one false positive per 50 cases and one false positive per 25 cases increased as well, 37% versus 51% (p<0.05) and 58% versus 67% (p<0.05) sensitivity, respectively. Additionally, the CADe system with BACs removal reduces the number of false positives per case by 29% on average. The same sensitivity at one false positive per 50 cases in the CADe system without BACs removal can be achieved at one false positive per 80 cases in the CADe system with BACs removal.
+                                                         CONCLUSIONS:
+                                                         By using dedicated algorithms to detect and remove breast arterial calcifications, the performance of CADe systems can be improved, in particular, at false positive rates representative for operating points used in screening.},
   file = {:pdf/Mord16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27036566},
@@ -22449,7 +22481,7 @@ @article{Murp19
   year = {2019},
   url = {https://arxiv.org/abs/1903.03349},
   abstract = {There is a growing interest in the automated analysis of chest X-Ray (CXR) as a sensitive and inexpensive means of screening susceptible populations for pulmonary tuberculosis. In this work we evaluate the latest version of CAD4TB, a software platform designed for this purpose. Version 6 of CAD4TB was released in 2018 and is here tested on an independent dataset of 5565 CXR images with GeneXpert (Xpert) sputum test results available (854 Xpert positive subjects). A subset of 500 subjects (50% Xpert positive) was reviewed and annotated by 5 expert observers independently to obtain a radiological reference standard. The latest version of CAD4TB is found to outperform all previous versions in terms of area under receiver operating curve (ROC) with respect to both Xpert and radiological reference standards. Improvements with respect to Xpert are most apparent at high sensitivity levels with a specificity of 76% obtained at 90% sensitivity. When compared with the radiological reference standard, CAD4TB v6 also outperformed previous versions by a considerable margin and achieved 98% specificity at 90% sensitivity. No substantial difference was found between the performance of CAD4TB v6 and any of the various expert observers against the Xpert reference standard. A cost and efficiency analysis on this dataset demonstrates that in a standard clinical situation, operating at 90% sensitivity, users of CAD4TB v6 can process 132 subjects per day at an average cost per screen of \$5.95 per sub
-                                                       ject, while users of version 3 process only 85 subjects per day at a cost of \$8.41 per subject. At all tested operating points version 6 is shown to be more efficient and cost effective than any other version.},
+                                                         ject, while users of version 3 process only 85 subjects per day at a cost of \$8.41 per subject. At all tested operating points version 6 is shown to be more efficient and cost effective than any other version.},
   optnote = {DIAG, RADIOLOGY},
   month = {3},
   all_ss_ids = {['61d1507dec6ef557cba474c4f67b1d0ad3631fb0']},
@@ -23301,17 +23333,17 @@ @article{Nist24
   url = {http://dx.doi.org/10.1186/s12903-024-04129-5},
   volume = {24},
   abstract = {Abstract
-                           Objective
-                           Panoramic radiographs (PRs) provide a comprehensive view of the oral and maxillofacial region and are used routinely to assess dental and osseous pathologies. Artificial intelligence (AI) can be used to improve the diagnostic accuracy of PRs compared to bitewings and periapical radiographs. This study aimed to evaluate the advantages and challenges of using publicly available datasets in dental AI research, focusing on solving the novel task of predicting tooth segmentations, FDI numbers, and tooth diagnoses, simultaneously.
-
-                           Materials and methods
-                           Datasets from the OdontoAI platform (tooth instance segmentations) and the DENTEX challenge (tooth bounding boxes with associated diagnoses) were combined to develop a two-stage AI model. The first stage implemented tooth instance segmentation with FDI numbering and extracted regions of interest around each tooth segmentation, whereafter the second stage implemented multi-label classification to detect dental caries, impacted teeth, and periapical lesions in PRs. The performance of the automated tooth segmentation algorithm was evaluated using a free-response receiver-operating-characteristics (FROC) curve and mean average precision (mAP) metrics. The diagnostic accuracy of detection and classification of dental pathology was evaluated with ROC curves and F1 and AUC metrics.
-
-                           Results
-                           The two-stage AI model achieved high accuracy in tooth segmentations with a FROC score of 0.988 and a mAP of 0.848. High accuracy was also achieved in the diagnostic classification of impacted teeth (F1 = 0.901, AUC = 0.996), whereas moderate accuracy was achieved in the diagnostic classification of deep caries (F1 = 0.683, AUC = 0.960), early caries (F1 = 0.662, AUC = 0.881), and periapical lesions (F1 = 0.603, AUC = 0.974). The model's performance correlated positively with the quality of annotations in the used public datasets. Selected samples from the DENTEX dataset revealed cases of missing (false-negative) and incorrect (false-positive) diagnoses, which negatively influenced the performance of the AI model.
-
-                           Conclusions
-                           The use and pooling of public datasets in dental AI research can significantly accelerate the development of new AI models and enable fast exploration of novel tasks. However, standardized quality assurance is essential before using the datasets to ensure reliable outcomes and limit potential biases.},
+                             Objective
+                             Panoramic radiographs (PRs) provide a comprehensive view of the oral and maxillofacial region and are used routinely to assess dental and osseous pathologies. Artificial intelligence (AI) can be used to improve the diagnostic accuracy of PRs compared to bitewings and periapical radiographs. This study aimed to evaluate the advantages and challenges of using publicly available datasets in dental AI research, focusing on solving the novel task of predicting tooth segmentations, FDI numbers, and tooth diagnoses, simultaneously.
+  
+                             Materials and methods
+                             Datasets from the OdontoAI platform (tooth instance segmentations) and the DENTEX challenge (tooth bounding boxes with associated diagnoses) were combined to develop a two-stage AI model. The first stage implemented tooth instance segmentation with FDI numbering and extracted regions of interest around each tooth segmentation, whereafter the second stage implemented multi-label classification to detect dental caries, impacted teeth, and periapical lesions in PRs. The performance of the automated tooth segmentation algorithm was evaluated using a free-response receiver-operating-characteristics (FROC) curve and mean average precision (mAP) metrics. The diagnostic accuracy of detection and classification of dental pathology was evaluated with ROC curves and F1 and AUC metrics.
+  
+                             Results
+                             The two-stage AI model achieved high accuracy in tooth segmentations with a FROC score of 0.988 and a mAP of 0.848. High accuracy was also achieved in the diagnostic classification of impacted teeth (F1 = 0.901, AUC = 0.996), whereas moderate accuracy was achieved in the diagnostic classification of deep caries (F1 = 0.683, AUC = 0.960), early caries (F1 = 0.662, AUC = 0.881), and periapical lesions (F1 = 0.603, AUC = 0.974). The model's performance correlated positively with the quality of annotations in the used public datasets. Selected samples from the DENTEX dataset revealed cases of missing (false-negative) and incorrect (false-positive) diagnoses, which negatively influenced the performance of the AI model.
+  
+                             Conclusions
+                             The use and pooling of public datasets in dental AI research can significantly accelerate the development of new AI models and enable fast exploration of novel tasks. However, standardized quality assurance is essential before using the datasets to ensure reliable outcomes and limit potential biases.},
   all_ss_ids = {['f29852db58fad3040c0bacc0303c3fc64f9c4897']},
   automatic = {yes},
   citation-count = {0},
@@ -23358,7 +23390,7 @@ @article{Noor23
   url = {http://dx.doi.org/10.1186/s41747-023-00372-7},
   volume = {7},
   abstract = {AbstractArtificial intelligence has opened a new path of innovation in magnetic resonance (MR) image reconstruction of undersampled k-space acquisitions. This review offers readers an analysis of the current deep learning-based MR image reconstruction methods. The literature in this field shows exponential growth, both in volume and complexity, as the capabilities of machine learning in solving inverse problems such as image reconstruction are explored. We review the latest developments, aiming to assist researchers and radiologists who are developing new methods or seeking to provide valuable feedback. We shed light on key concepts by exploring the technical intricacies of MR image reconstruction, highlighting the importance of raw datasets and the difficulty of evaluating diagnostic value using standard metrics.Relevance statement Increasingly complex algorithms output reconstructed images that are difficult to assess for robustness and diagnostic quality, necessitating high-quality datasets and collaboration with radiologists.Key points* Deep learning-based image reconstruction algorithms are increasing both in complexity and performance.* The evaluation of reconstructed images may mistake perceived image quality for diagnostic value.* Collaboration with radiologists is crucial for advancing deep learning technology.
-                                         Graphical Abstract},
+                                           Graphical Abstract},
   all_ss_ids = {[03a7b0c8377b4cfd5bd37215d644d31033b7ae23]},
   automatic = {yes},
   citation-count = {0},
@@ -23375,12 +23407,12 @@ @article{Noot22
   title = {Knowledge distillation with ensembles of convolutional neural networks for medical image segmentation},
   doi = {https://doi.org/10.1117/1.JMI.9.5.052407},
   abstract = {Purpose: Ensembles of convolutional neural networks (CNNs) often outperform a single CNN in medical image segmentation tasks, but inference is computationally more expensive and makes ensembles unattractive for some applications. We compared the performance of differently constructed ensembles with the performance of CNNs derived from these ensembles using knowledge distillation, a technique for reducing the footprint of large models such as ensembles.
-
-                            Approach: We investigated two different types of ensembles, namely, diverse ensembles of networks with three different architectures and two different loss-functions, and uniform ensembles of networks with the same architecture but initialized with different random seeds. For each ensemble, additionally, a single student network was trained to mimic the class probabilities predicted by the teacher model, the ensemble. We evaluated the performance of each network, the ensembles, and the corresponding distilled networks across three different publicly available datasets. These included chest computed tomography scans with four annotated organs of interest, brain magnetic resonance imaging (MRI) with six annotated brain structures, and cardiac cine-MRI with three annotated heart structures.
-
-                            Results: Both uniform and diverse ensembles obtained better results than any of the individual networks in the ensemble. Furthermore, applying knowledge distillation resulted in a single network that was smaller and faster without compromising performance compared with the ensemble it learned from. The distilled networks significantly outperformed the same network trained with reference segmentation instead of knowledge distillation.
-
-                            Conclusion: Knowledge distillation can compress segmentation ensembles of uniform or diverse composition into a single CNN while maintaining the performance of the ensemble.},
+  
+                              Approach: We investigated two different types of ensembles, namely, diverse ensembles of networks with three different architectures and two different loss-functions, and uniform ensembles of networks with the same architecture but initialized with different random seeds. For each ensemble, additionally, a single student network was trained to mimic the class probabilities predicted by the teacher model, the ensemble. We evaluated the performance of each network, the ensembles, and the corresponding distilled networks across three different publicly available datasets. These included chest computed tomography scans with four annotated organs of interest, brain magnetic resonance imaging (MRI) with six annotated brain structures, and cardiac cine-MRI with three annotated heart structures.
+  
+                              Results: Both uniform and diverse ensembles obtained better results than any of the individual networks in the ensemble. Furthermore, applying knowledge distillation resulted in a single network that was smaller and faster without compromising performance compared with the ensemble it learned from. The distilled networks significantly outperformed the same network trained with reference segmentation instead of knowledge distillation.
+  
+                              Conclusion: Knowledge distillation can compress segmentation ensembles of uniform or diverse composition into a single CNN while maintaining the performance of the ensemble.},
   file = {Noot22.pdf:pdf\\Noot22.pdf:PDF},
   journal = {Journal of Medical Imaging},
   month = {05},
@@ -23396,10 +23428,10 @@ @conference{Obre24
   booktitle = ECR,
   title = {Deep Learning for estimating pulmonary nodule malignancy risk: How much data does AI need to reach radiologist level performance?},
   abstract = {Deep learning algorithms require large training datasets to achieve optimal performance. For many AI tasks, it is unclear whether algorithm performance would improve further if more training data was added. The aim of this study is to quantify the number of CT training samples required to achieve radiologist-level performance for a deep learning AI algorithm that estimates pulmonary nodule malignancy risk.
-           Methods and materials: For estimating pulmonary nodule malignancy risk, we used the NLST dataset (malignant nodules:1249, benign nodules:14828) to train a deep learning algorithm. The dataset was split: 80% training and 20% internal validation. The algorithm was trained on random subsets of the training set with subset sizes ranging from 10% to 100%, with a class distribution of malignant7.77% and benign92.23%. The trained AI algorithms were validated on a size-matched cancer-enriched cohort (malignant:59, benign:118) from DLCST. The performance was compared against a group of 11 clinicians that also scored the test set, which included 4 thoracic radiologists.
-           Results: Using training data subsets of 10%, 20%, and 30%, the AI achieved AUC values of 0.74 (95%CI:0.67-0.82), 0.79 (95%CI:0.72-0.85), and 0.81 (95%CI:0.74-0.87) respectively. When the training data set size reached 60% (malignant:602, benign:7112), the performance saturated, reaching an AUC of 0.82 (95%CI:0.75-0.88). This was comparable to the average AUC of all clinicians (0.82,95%CI:0.77-0.86,p>0.99) and of the 4 thoracic radiologists (0.82,95%CI:0.74-0.89,p>0.99).
-           Conclusion: The AI was able to reach the level of an experienced thoracic radiologist when it was trained on 7714 nodules (malignant:602) from the NLST dataset. These findings have potential implications for the allocation of resources in developing deep learning algorithms for lung cancer medical imaging diagnostics.
-           Limitations: The generalizability of these findings is constrained by heterogeneity and geographical limitations of the datasets used in this study.},
+             Methods and materials: For estimating pulmonary nodule malignancy risk, we used the NLST dataset (malignant nodules:1249, benign nodules:14828) to train a deep learning algorithm. The dataset was split: 80% training and 20% internal validation. The algorithm was trained on random subsets of the training set with subset sizes ranging from 10% to 100%, with a class distribution of malignant:7.77% and benign:92.23%. The trained AI algorithms were validated on a size-matched cancer-enriched cohort (malignant:59, benign:118) from DLCST. The performance was compared against a group of 11 clinicians that also scored the test set, which included 4 thoracic radiologists.
+             Results: Using training data subsets of 10%, 20%, and 30%, the AI achieved AUC values of 0.74 (95%CI:0.67-0.82), 0.79 (95%CI:0.72-0.85), and 0.81 (95%CI:0.74-0.87) respectively. When the training data set size reached 60% (malignant:602, benign:7112), the performance saturated, reaching an AUC of 0.82 (95%CI:0.75-0.88). This was comparable to the average AUC of all clinicians (0.82,95%CI:0.77-0.86,p>0.99) and of the 4 thoracic radiologists (0.82,95%CI:0.74-0.89,p>0.99).
+             Conclusion: The AI was able to reach the level of an experienced thoracic radiologist when it was trained on 7714 nodules (malignant:602) from the NLST dataset. These findings have potential implications for the allocation of resources in developing deep learning algorithms for lung cancer medical imaging diagnostics.
+             Limitations: The generalizability of these findings is constrained by heterogeneity and geographical limitations of the datasets used in this study.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -23499,12 +23531,12 @@ @article{Oei18
   pages = {3902-3911},
   doi = {10.1007/s00330-018-5353-y},
   abstract = {Objectives: To assess observer variability of different reference tissues used for relative CBV (rCBV) measurements in DSC-MRI of glioma patients.
-
-                                                       Methods: In this retrospective study, three observers measured rCBVin DSC-MRimages of 44 glioma patients on two occasions. rCBVis calculated by the CBVin the tumour hotspot/the CBVof a reference tissue at the contralateral side for normalization. One observer annotated the tumour hotspot that was kept constant for all measurements. All observers annotated eight reference tissues of normal white and grey matter. Observer variability was evaluated using the intraclass correlation coefficient (ICC), coefficient of variation (CV) and Bland-Altman analyses.
-
-                                                       Results: For intra-observer, the ICC ranged from 0.50-0.97 (fair-excellent) for all reference tissues. The CV ranged from 5.1-22.1 % for all reference tissues and observers. For inter-observer, the ICC for all pairwise observer combinations ranged from 0.44-0.92 (poor-excellent). The CV ranged from 8.1-31.1 %. Centrum semiovale was the only reference tissue that showed excellent intra- and inter-observer agreement (ICC>0.85) and lowest CVs (<12.5 %). Bland-Altman analyses showed that mean differences for centrum semiovale were close to zero.
-
-                                                       Conclusion: Selecting contralateral centrum semiovale as reference tissue for rCBV provides the lowest observer variability.},
+  
+                                                         Methods: In this retrospective study, three observers measured rCBV in DSC-MR images of 44 glioma patients on two occasions. rCBV is calculated by the CBV in the tumour hotspot/the CBV of a reference tissue at the contralateral side for normalization. One observer annotated the tumour hotspot that was kept constant for all measurements. All observers annotated eight reference tissues of normal white and grey matter. Observer variability was evaluated using the intraclass correlation coefficient (ICC), coefficient of variation (CV) and Bland-Altman analyses.
+  
+                                                         Results: For intra-observer, the ICC ranged from 0.50-0.97 (fair-excellent) for all reference tissues. The CV ranged from 5.1-22.1 % for all reference tissues and observers. For inter-observer, the ICC for all pairwise observer combinations ranged from 0.44-0.92 (poor-excellent). The CV ranged from 8.1-31.1 %. Centrum semiovale was the only reference tissue that showed excellent intra- and inter-observer agreement (ICC>0.85) and lowest CVs (<12.5 %). Bland-Altman analyses showed that mean differences for centrum semiovale were close to zero.
+  
+                                                         Conclusion: Selecting contralateral centrum semiovale as reference tissue for rCBV provides the lowest observer variability.},
   file = {:Oei18 - Observer Variability of Reference Tissue Selection for Relative Cerebral Blood Volume Measurements in Glioma Patients.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29572637},
@@ -23554,17 +23586,17 @@ @article{Ogon22
   url = {http://dx.doi.org/10.1186/s13058-022-01541-z},
   volume = {24},
   abstract = {Abstract
-                                         Background
-                                         Breast terminal duct lobular units (TDLUs), the source of most breast cancer (BC) precursors, are shaped by age-related involution, a gradual process, and postpartum involution (PPI), a dramatic inflammatory process that restores baseline microanatomy after weaning. Dysregulated PPI is implicated in the pathogenesis of postpartum BCs. We propose that assessment of TDLUs in the postpartum period may have value in risk estimation, but characteristics of these tissues in relation to epidemiological factors are incompletely described.
-
-                                         Methods
-                                         Using validated Artificial Intelligence and morphometric methods, we analyzed digitized images of tissue sections of normal breast tissues stained with hematoxylin and eosin from donors <= 45 years from the Komen Tissue Bank (180 parous and 545 nulliparous). Metrics assessed by AI, included: TDLU count; adipose tissue fraction; mean acini count/TDLU; mean dilated acini; mean average acini area; mean "capillary" area; mean epithelial area; mean ratio of epithelial area versus intralobular stroma; mean mononuclear cell count (surrogate of immune cells); mean fat area proximate to TDLUs and TDLU area. We compared epidemiologic characteristics collected via questionnaire by parity status and race, using a Wilcoxon rank sum test or Fisher's exact test. Histologic features were compared between nulliparous and parous women (overall and by time between last birth and donation [recent birth: <= 5 years versus remote birth: &gt; 5 years]) using multivariable regression models.
-
-                                         Results
-                                         Normal breast tissues of parous women contained significantly higher TDLU counts and acini counts, more frequent dilated acini, higher mononuclear cell counts in TDLUs and smaller acini area per TDLU than nulliparas (all multivariable analyses p &lt; 0.001). Differences in TDLU counts and average acini size persisted for &gt; 5 years postpartum, whereas increases in immune cells were most marked <= 5 years of a birth. Relationships were suggestively modified by several other factors, including demographic and reproductive characteristics, ethanol consumption and breastfeeding duration.
-
-                                         Conclusions
-                                         Our study identified sustained expansion of TDLU numbers and reduced average acini area among parous versus nulliparous women and notable increases in immune responses within five years following childbirth. Further, we show that quantitative characteristics of normal breast samples vary with demographic features and BC risk factors.},
+                                           Background
+                                           Breast terminal duct lobular units (TDLUs), the source of most breast cancer (BC) precursors, are shaped by age-related involution, a gradual process, and postpartum involution (PPI), a dramatic inflammatory process that restores baseline microanatomy after weaning. Dysregulated PPI is implicated in the pathogenesis of postpartum BCs. We propose that assessment of TDLUs in the postpartum period may have value in risk estimation, but characteristics of these tissues in relation to epidemiological factors are incompletely described.
+  
+                                           Methods
+                                           Using validated Artificial Intelligence and morphometric methods, we analyzed digitized images of tissue sections of normal breast tissues stained with hematoxylin and eosin from donors <= 45 years from the Komen Tissue Bank (180 parous and 545 nulliparous). Metrics assessed by AI, included: TDLU count; adipose tissue fraction; mean acini count/TDLU; mean dilated acini; mean average acini area; mean "capillary" area; mean epithelial area; mean ratio of epithelial area versus intralobular stroma; mean mononuclear cell count (surrogate of immune cells); mean fat area proximate to TDLUs and TDLU area. We compared epidemiologic characteristics collected via questionnaire by parity status and race, using a Wilcoxon rank sum test or Fisher's exact test. Histologic features were compared between nulliparous and parous women (overall and by time between last birth and donation [recent birth: <= 5 years versus remote birth: > 5 years]) using multivariable regression models.
+  
+                                           Results
+                                           Normal breast tissues of parous women contained significantly higher TDLU counts and acini counts, more frequent dilated acini, higher mononuclear cell counts in TDLUs and smaller acini area per TDLU than nulliparas (all multivariable analyses p < 0.001). Differences in TDLU counts and average acini size persisted for > 5 years postpartum, whereas increases in immune cells were most marked <= 5 years of a birth. Relationships were suggestively modified by several other factors, including demographic and reproductive characteristics, ethanol consumption and breastfeeding duration.
+  
+                                           Conclusions
+                                           Our study identified sustained expansion of TDLU numbers and reduced average acini area among parous versus nulliparous women and notable increases in immune responses within five years following childbirth. Further, we show that quantitative characteristics of normal breast samples vary with demographic features and BC risk factors.},
   all_ss_ids = {['fc34292163822dca66f4f284ceb0e5cb689727f5', '70eaa7319f839839bd90576e5a64fae2f4b11fb5']},
   automatic = {yes},
   citation-count = {1},
@@ -23583,14 +23615,14 @@ @article{Olac20
   pages = {124-131},
   volume = {71},
   abstract = {Purpose:
-                                                       EPID dosimetry in the Unity MR-Linac system allows for reconstruction of absolute dose distributions within the patient geometry. Dose reconstruction is accurate for the parts of the beam arriving at the EPID through the MRI central unattenuated region, free of gradient coils, resulting in a maximum field size of ~10x22 cm2 at isocentre. The purpose of this study is to develop a Deep Learning-based method to improve the accuracy of 2D EPID reconstructed dose distributions outside this central region, accounting for the effects of the extra attenuation and scatter.
-
-                                                       Methods:
-                                                       A U-Net was trained to correct EPID dose images calculated at the isocenter inside a cylindrical phantom using the corresponding TPS dose images as ground truth for training. The model was evaluated using a 5-fold cross validation procedure. The clinical validity of the U-Net corrected dose images (the so-called DEEPID dose images) was assessed with in vivo verification data of 45 large rectum IMRT fields. The sensitivity of DEEPID to leaf bank position errors (+-1.5 mm) and +-5% MU delivery errors was also tested.
-
-                                                       Results:
-                                                       Compared to the TPS, in vivo 2D DEEPID dose images showed an average g-pass rate of 90.2% (72.6%-99.4%) outside the central unattenuated region. Without DEEPID correction, this number was 44.5% (4.0%-78.4%). DEEPID correctly detected the introduced delivery errors .
-                                                       Conclusions: DEEPID allows for accurate dose reconstruction using the entire EPID image, thus enabling dosimetric verification for field sizes up to ~19x22 cm2 at isocentre. The method can be used to detect clinically relevant errors.},
+                                                         EPID dosimetry in the Unity MR-Linac system allows for reconstruction of absolute dose distributions within the patient geometry. Dose reconstruction is accurate for the parts of the beam arriving at the EPID through the MRI central unattenuated region, free of gradient coils, resulting in a maximum field size of ~10x22 cm2 at isocentre. The purpose of this study is to develop a Deep Learning-based method to improve the accuracy of 2D EPID reconstructed dose distributions outside this central region, accounting for the effects of the extra attenuation and scatter.
+  
+                                                         Methods:
+                                                         A U-Net was trained to correct EPID dose images calculated at the isocenter inside a cylindrical phantom using the corresponding TPS dose images as ground truth for training. The model was evaluated using a 5-fold cross validation procedure. The clinical validity of the U-Net corrected dose images (the so-called DEEPID dose images) was assessed with in vivo verification data of 45 large rectum IMRT fields. The sensitivity of DEEPID to leaf bank position errors (+-1.5 mm) and +-5% MU delivery errors was also tested.
+  
+                                                         Results:
+                                                         Compared to the TPS, in vivo 2D DEEPID dose images showed an average gamma-pass rate of 90.2% (72.6%-99.4%) outside the central unattenuated region. Without DEEPID correction, this number was 44.5% (4.0%-78.4%). DEEPID correctly detected the introduced delivery errors.
+                                                         Conclusions: DEEPID allows for accurate dose reconstruction using the entire EPID image, thus enabling dosimetric verification for field sizes up to ~19x22 cm2 at isocentre. The method can be used to detect clinically relevant errors.},
   file = {Olac20.pdf:pdf\\Olac20.pdf:PDF},
   journal = PHYSMED,
   optnote = {DIAG, RADIOLOGY},
@@ -23730,8 +23762,8 @@ @mastersthesis{Oude19
   title = {Reversible Networks for Memory-efficient Image-to-Image Translation in 3D Medical Imaging},
   year = {2019},
   abstract = {The Pix2pix and CycleGAN losses have vastly improved the qualitative and quantitative visual quality of results in image-to-image translation tasks. We extend this framework by exploring approximately invertible architectures which are well suited to these losses. These architectures are approximately invertible by design and thus partially satisfy cycle-consistency before training even begins. Furthermore, since invertible architectures have constant memory complexity in depth, these models can be built arbitrarily deep. We are able to demonstrate superior quantitative output on the Cityscapes and Maps datasets.
-
-                                                       Additionally, we show that the model allows us to perform several memory-intensive medical imaging tasks, including a super-resolution problem on 3D MRI brain volumes. We also demonstrate that our model can perform a 3D domain-adaptation and 3D super-resolution task on chest CT volumes. By doing this, we provide a proof-of-principle for using reversible networks to create a model capable of pre-processing 3D CT scans to high resolution with a standardized appearance.},
+  
+                                                         Additionally, we show that the model allows us to perform several memory-intensive medical imaging tasks, including a super-resolution problem on 3D MRI brain volumes. We also demonstrate that our model can perform a 3D domain-adaptation and 3D super-resolution task on chest CT volumes. By doing this, we provide a proof-of-principle for using reversible networks to create a model capable of pre-processing 3D CT scans to high resolution with a standardized appearance.},
   file = {Oude19.pdf:pdf/Oude19.pdf:PDF},
   optnote = {DIAG},
   school = {University of Amsterdam},
@@ -23814,13 +23846,13 @@ @conference{Pate17
   booktitle = ECR,
   year = {2017},
   abstract = {PURPOSE
-                                                       Cranial cavity segmentation in CT is the essential first step for subsequent image processing and automated detection of cerebral pathology. This becomes complicated in the presence of skull fractures, metallic foreign objects or due to connected soft tissues such as the orbit. A robust and accurate method is presented to segment the cranial cavity in CT images.
-                                                       METHOD AND MATERIALS
-                                                       We propose a multi-atlas based method that uses atlas selection based on anterior skull variations, followed by a two-stage levelset refinement. The method was developed using a set of 99 non-contrast CT and 18 CT perfusion (CTP) scans obtained for emergency indications on a 320-row detector CT scanner. It was evaluated on a different set of 200 non-contrast CT and 100 CTP scans obtained for the same indications. Quality of segmentations was visually assessed. The reference standard consisted of three randomly selected orthogonal slices per patient that were manually annotated by trained observers. The corresponding slices were extracted and compared to the reference standard. Dice similarity coefficient (DSC) and 95th percentile Hausdorff distance (95% HD) were reported.
-                                                       RESULTS
-                                                       The segmentation results were evaluated as very good to excellent. The method achieved a mean DSC of 0.98 +- 0.03 and mean 95% HD of 0.60 +- 2.15 mm in comparison to the reference standard.
-                                                       CONCLUSION
-                                                       The proposed method is capable of accurate segmentation of the cranial cavity in non-contrast CT and CTP independent of gross pathology or foreign objects. The method provides a fundamental first step towards automated evaluation of cranial CT.},
+                                                         Cranial cavity segmentation in CT is the essential first step for subsequent image processing and automated detection of cerebral pathology. This becomes complicated in the presence of skull fractures, metallic foreign objects or due to connected soft tissues such as the orbit. A robust and accurate method is presented to segment the cranial cavity in CT images.
+                                                         METHOD AND MATERIALS
+                                                         We propose a multi-atlas based method that uses atlas selection based on anterior skull variations, followed by a two-stage levelset refinement. The method was developed using a set of 99 non-contrast CT and 18 CT perfusion (CTP) scans obtained for emergency indications on a 320-row detector CT scanner. It was evaluated on a different set of 200 non-contrast CT and 100 CTP scans obtained for the same indications. Quality of segmentations was visually assessed. The reference standard consisted of three randomly selected orthogonal slices per patient that were manually annotated by trained observers. The corresponding slices were extracted and compared to the reference standard. Dice similarity coefficient (DSC) and 95th percentile Hausdorff distance (95% HD) were reported.
+                                                         RESULTS
+                                                         The segmentation results were evaluated as very good to excellent. The method achieved a mean DSC of 0.98 +- 0.03 and mean 95% HD of 0.60 +- 2.15 mm in comparison to the reference standard.
+                                                         CONCLUSION
+                                                         The proposed method is capable of accurate segmentation of the cranial cavity in non-contrast CT and CTP independent of gross pathology or foreign objects. The method provides a fundamental first step towards automated evaluation of cranial CT.},
   optnote = {DIAG},
 }
 
@@ -23911,15 +23943,15 @@ @article{Pate19a
   pages = {17858},
   doi = {10.1038/s41598-019-54491-6},
   abstract = {A 3-dimensional (3D) convolutional neural network is presented for the segmentation and quantification of spontaneous
-                                                       intracerebral haemorrhage (ICH) in non-contrast computed tomography (NCCT). The method utilises a combination of
-                                                       contextual information on multiple scales for fast and fully automatic dense predictions. To handle a large class imbalance
-                                                       present in the data, a weight map is introduced during training. The method was evaluated on two datasets of 25 and 50
-                                                       patients respectively. The reference standard consisted of manual annotations for each ICH in the dataset. Quantitative
-                                                       analysis showed a median Dice similarity coefficient of 0.91 [0.87 - 0.94] and 0.90 [0.85 - 0.92] for the two test datasets in
-                                                       comparison to the reference standards. Evaluation of a separate dataset of 5 patients for the assessment of the observer
-                                                       variability produced a mean Dice similarity coefficient of 0.95 +/- 0.02 for the inter-observer variability and 0.97 +/- 0.01 for the
-                                                       intra-observer variability. The average prediction time for an entire volume was 104 +/- 15 seconds. The results demonstrate
-                                                       that the method is accurate and approaches the performance of expert manual annotation.},
+                                                         intracerebral haemorrhage (ICH) in non-contrast computed tomography (NCCT). The method utilises a combination of
+                                                         contextual information on multiple scales for fast and fully automatic dense predictions. To handle a large class imbalance
+                                                         present in the data, a weight map is introduced during training. The method was evaluated on two datasets of 25 and 50
+                                                         patients respectively. The reference standard consisted of manual annotations for each ICH in the dataset. Quantitative
+                                                         analysis showed a median Dice similarity coefficient of 0.91 [0.87 - 0.94] and 0.90 [0.85 - 0.92] for the two test datasets in
+                                                         comparison to the reference standards. Evaluation of a separate dataset of 5 patients for the assessment of the observer
+                                                         variability produced a mean Dice similarity coefficient of 0.95 +/- 0.02 for the inter-observer variability and 0.97 +/- 0.01 for the
+                                                         intra-observer variability. The average prediction time for an entire volume was 104 +/- 15 seconds. The results demonstrate
+                                                         that the method is accurate and approaches the performance of expert manual annotation.},
   file = {:pdf/Pate19a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {31780815},
@@ -23990,10 +24022,10 @@ @conference{Peet23
   booktitle = ECR,
   title = {The effect of applying an uncertainty estimation method on the performance of a deep learning model for nodule malignancy risk estimation},
   abstract = {Purpose: Artificial Intelligence (AI) algorithms often lack uncertainty estimation for classification tasks. Uncertainty estimation may however be an important requirement for clinical adoption of AI algorithms. In this study, we integrate a method for uncertainty estimation into a previously developed AI algorithm and investigate the performance when applying different uncertainty thresholds.
-                         Methods and materials: We used a retrospective external validation dataset from the Danish Lung Cancer Screening Trial, containing 818 benign and 65 malignant nodules. Our previously developed AI algorithm for nodule malignancy risk estimation was extended with a method for measuring the prediction uncertainty. The uncertainty score (UnS) was calculated by measuring the standard deviation over 20 different predictions of an ensemble of AI models. Two UnS thresholds at the 90th and 95th percentile were applied to retain 90% and 95% of all cases as certain, respectively. For these scenarios, we calculated the area under the ROC curve (AUC) for certain and uncertain cases, and for the full set of nodules.
-                         Results: On the full set of 883 nodules, the AUC of the AI risk score was 0.932. For the 90th and 95th percentile, the AUC of the AI risk score for certain cases was 0.934 and 0.935, respectively, and for the uncertain cases was 0.710 and 0.688, respectively.
-                         Conclusion: In this retrospective data set, we demonstrate that integrating an uncertainty estimation method into a deep learning-based nodule malignancy risk estimation algorithm slightly increased the performance on certain cases. The AI performance is substantially worse on uncertain cases and therefore in need of human visual review.
-                         Limitations: This study is a retrospective analysis on data from one single lung cancer screening trial. More external validation is needed.},
+                           Methods and materials: We used a retrospective external validation dataset from the Danish Lung Cancer Screening Trial, containing 818 benign and 65 malignant nodules. Our previously developed AI algorithm for nodule malignancy risk estimation was extended with a method for measuring the prediction uncertainty. The uncertainty score (UnS) was calculated by measuring the standard deviation over 20 different predictions of an ensemble of AI models. Two UnS thresholds at the 90th and 95th percentile were applied to retain 90% and 95% of all cases as certain, respectively. For these scenarios, we calculated the area under the ROC curve (AUC) for certain and uncertain cases, and for the full set of nodules.
+                           Results: On the full set of 883 nodules, the AUC of the AI risk score was 0.932. For the 90th and 95th percentile, the AUC of the AI risk score for certain cases was 0.934 and 0.935, respectively, and for the uncertain cases was 0.710 and 0.688, respectively.
+                           Conclusion: In this retrospective data set, we demonstrate that integrating an uncertainty estimation method into a deep learning-based nodule malignancy risk estimation algorithm slightly increased the performance on certain cases. The AI performance is substantially worse on uncertain cases and therefore in need of human visual review.
+                           Limitations: This study is a retrospective analysis on data from one single lung cancer screening trial. More external validation is needed.},
   optnote = {DIAG, RADIOLOGY},
   year = {2023},
 }
@@ -24004,25 +24036,25 @@ @article{Peet24
   doi = {10.1007/s00330-024-10714-7},
   url = {http://dx.doi.org/10.1007/s00330-024-10714-7},
   abstract = {Abstract
-                         Objective
-                         To investigate the effect of uncertainty estimation on the performance of a Deep Learning (DL) algorithm for estimating malignancy risk of pulmonary nodules.
-
-                         Methods and materials
-                         In this retrospective study, we integrated an uncertainty estimation method into a previously developed DL algorithm for nodule malignancy risk estimation. Uncertainty thresholds were developed using CT data from the Danish Lung Cancer Screening Trial (DLCST), containing 883 nodules (65 malignant) collected between 2004 and 2010. We used thresholds on the 90th and 95th percentiles of the uncertainty score distribution to categorize nodules into certain and uncertain groups. External validation was performed on clinical CT data from a tertiary academic center containing 374 nodules (207 malignant) collected between 2004 and 2012. DL performance was measured using area under the ROC curve (AUC) for the full set of nodules, for the certain cases and for the uncertain cases. Additionally, nodule characteristics were compared to identify trends for inducing uncertainty.
-
-                         Results
-                         The DL algorithm performed significantly worse in the uncertain group compared to the certain group of DLCST (AUC 0.62 (95% CI: 0.49, 0.76) vs 0.93 (95% CI: 0.88, 0.97); p < .001) and the clinical dataset (AUC 0.62 (95% CI: 0.50, 0.73) vs 0.90 (95% CI: 0.86, 0.94); p < .001). The uncertain group included larger benign nodules as well as more part-solid and non-solid nodules than the certain group.
-
-                         Conclusion
-                         The integrated uncertainty estimation showed excellent performance for identifying uncertain cases in which the DL-based nodule malignancy risk estimation algorithm had significantly worse performance.
-
-                         Clinical relevance statement
-                         Deep Learning algorithms often lack the ability to gauge and communicate uncertainty. For safe clinical implementation, uncertainty estimation is of pivotal importance to identify cases where the deep learning algorithm harbors doubt in its prediction.
-
-                         Key Points
-                         * Deep learning (DL) algorithms often lack uncertainty estimation, which potentially reduce the risk of errors and improve safety during clinical adoption of the DL algorithm.
-                         * Uncertainty estimation identifies pulmonary nodules in which the discriminative performance of the DL algorithm is significantly worse.
-                         * Uncertainty estimation can further enhance the benefits of the DL algorithm and improve its safety and trustworthiness.},
+                           Objective
+                           To investigate the effect of uncertainty estimation on the performance of a Deep Learning (DL) algorithm for estimating malignancy risk of pulmonary nodules.
+  
+                           Methods and materials
+                           In this retrospective study, we integrated an uncertainty estimation method into a previously developed DL algorithm for nodule malignancy risk estimation. Uncertainty thresholds were developed using CT data from the Danish Lung Cancer Screening Trial (DLCST), containing 883 nodules (65 malignant) collected between 2004 and 2010. We used thresholds on the 90th and 95th percentiles of the uncertainty score distribution to categorize nodules into certain and uncertain groups. External validation was performed on clinical CT data from a tertiary academic center containing 374 nodules (207 malignant) collected between 2004 and 2012. DL performance was measured using area under the ROC curve (AUC) for the full set of nodules, for the certain cases and for the uncertain cases. Additionally, nodule characteristics were compared to identify trends for inducing uncertainty.
+  
+                           Results
+                           The DL algorithm performed significantly worse in the uncertain group compared to the certain group of DLCST (AUC 0.62 (95% CI: 0.49, 0.76) vs 0.93 (95% CI: 0.88, 0.97); p < .001) and the clinical dataset (AUC 0.62 (95% CI: 0.50, 0.73) vs 0.90 (95% CI: 0.86, 0.94); p < .001). The uncertain group included larger benign nodules as well as more part-solid and non-solid nodules than the certain group.
+  
+                           Conclusion
+                           The integrated uncertainty estimation showed excellent performance for identifying uncertain cases in which the DL-based nodule malignancy risk estimation algorithm had significantly worse performance.
+  
+                           Clinical relevance statement
+                           Deep Learning algorithms often lack the ability to gauge and communicate uncertainty. For safe clinical implementation, uncertainty estimation is of pivotal importance to identify cases where the deep learning algorithm harbors doubt in its prediction.
+  
+                           Key Points
+                           * Deep learning (DL) algorithms often lack uncertainty estimation, which potentially reduce the risk of errors and improve safety during clinical adoption of the DL algorithm.
+                           * Uncertainty estimation identifies pulmonary nodules in which the discriminative performance of the DL algorithm is significantly worse.
+                           * Uncertainty estimation can further enhance the benefits of the DL algorithm and improve its safety and trustworthiness.},
   all_ss_ids = {31a2c0b7e1cf99e9ae5825606a66c00a3a0b1f8e},
   automatic = {yes},
   citation-count = {0},
@@ -24038,13 +24070,13 @@ @conference{Peet24a
   booktitle = ESTI,
   title = {Towards safe and reliable implementation of AI models for nodule malignancy estimation using distance-based out-of-distribution detection},
   abstract = {Purpose:
-       Artificial Intelligence (AI) models may fail or suffer from reduced performance when applied to unseen data that differs from the training data distribution. Automatic detection of out-of-distribution (OOD) data helps to ensure safe and reliable clinical implementation of AI models. In this study, we integrate different OOD detection methods into a previously developed AI model for nodule malignancy risk estimation and evaluate their performance for OOD detection.
-       Methods and materials:
-       We used retrospective datasets from three sources: the National Lung Cancer Screening Trial (NLST, 16077 nodules with 1249 malignant), the Danish Lung Cancer Screening Trial (DLCST, 883 nodules with 65 malignant) and Clinical Routine data from a Dutch academic hospital (374 nodules with 207 malignant). NLST represents in-distribution data since it was used in the development of the AI model. DLCST, also comprising screening data, is categorized as near-OOD data. Clinical Routine data represents far-OOD data because of its diversity in CT protocols and disease incidence. We integrated the following three techniques into our AI model for malignancy risk estimation to calculate OOD scores for all nodules: maximum softmax probability (MSP), energy scoring (ES), and mahalanobis distance (MD) between the features of a test sample and the features of in-distribution samples. MSP takes the highest softmax output probability, while ES computes the log of the summed exponential values from the softmax output. MSP and ES exploit lower confidence in softmax outputs for OOD data. By categorizing NLST as in-distribution, and DLCST and Clinical routine as OOD, we assessed OOD detection performance using area under the ROC curve (AUC). For this, NLST was treated as negative samples, while DLCST and Clinical Routine were treated as positive samples.
-       Results:
-       For DLCST and Clinical Routine, the OOD methods based on MSP and ES showed a moderate ability to separate the data from NLST data with AUCs of 0.53 and 0.66, respectively. The OOD detection method based on MD demonstrated outstanding performance, achieving AUCs of 0.99 and 1.00, respectively.
-       Conclusion:
-       The MD-based OOD detection approach can be seamlessly integrated in an existing AI model and demonstrated to successfully detect far-OOD and near-OOD data. Integration of this approach could be a helpful tool to limit the AI model from failing silently on unseen and abnormal data, thereby enhancing patient safety.},
+         Artificial Intelligence (AI) models may fail or suffer from reduced performance when applied to unseen data that differs from the training data distribution. Automatic detection of out-of-distribution (OOD) data helps to ensure safe and reliable clinical implementation of AI models. In this study, we integrate different OOD detection methods into a previously developed AI model for nodule malignancy risk estimation and evaluate their performance for OOD detection.
+         Methods and materials:
+         We used retrospective datasets from three sources: the National Lung Cancer Screening Trial (NLST, 16077 nodules with 1249 malignant), the Danish Lung Cancer Screening Trial (DLCST, 883 nodules with 65 malignant) and Clinical Routine data from a Dutch academic hospital (374 nodules with 207 malignant). NLST represents in-distribution data since it was used in the development of the AI model. DLCST, also comprising screening data, is categorized as near-OOD data. Clinical Routine data represents far-OOD data because of its diversity in CT protocols and disease incidence. We integrated the following three techniques into our AI model for malignancy risk estimation to calculate OOD scores for all nodules: maximum softmax probability (MSP), energy scoring (ES), and mahalanobis distance (MD) between the features of a test sample and the features of in-distribution samples. MSP takes the highest softmax output probability, while ES computes the log of the summed exponential values from the softmax output. MSP and ES exploit lower confidence in softmax outputs for OOD data. By categorizing NLST as in-distribution, and DLCST and Clinical routine as OOD, we assessed OOD detection performance using area under the ROC curve (AUC). For this, NLST was treated as negative samples, while DLCST and Clinical Routine were treated as positive samples.
+         Results:
+         For DLCST and Clinical Routine, the OOD methods based on MSP and ES showed a moderate ability to separate the data from NLST data with AUCs of 0.53 and 0.66, respectively. The OOD detection method based on MD demonstrated outstanding performance, achieving AUCs of 0.99 and 1.00, respectively.
+         Conclusion:
+         The MD-based OOD detection approach can be seamlessly integrated in an existing AI model and demonstrated to successfully detect far-OOD and near-OOD data. Integration of this approach could be a helpful tool to limit the AI model from failing silently on unseen and abnormal data, thereby enhancing patient safety.},
   optnote = {DIAG, RADIOLOGY},
   year = {2024},
 }
@@ -24218,21 +24250,21 @@ @article{Pfob22
   url = {http://dx.doi.org/10.1007/s00330-021-08519-z},
   volume = {32},
   abstract = {Abstract                 Objectives
-                                         AI-based algorithms for medical image analysis showed comparable performance to human image readers. However, in practice, diagnoses are made using multiple imaging modalities alongside other data sources. We determined the importance of this multi-modal information and compared the diagnostic performance of routine breast cancer diagnosis to breast ultrasound interpretations by humans or AI-based algorithms.
-
-                                         Methods
-                                         Patients were recruited as part of a multicenter trial (NCT02638935). The trial enrolled 1288 women undergoing routine breast cancer diagnosis (multi-modal imaging, demographic, and clinical information). Three physicians specialized in ultrasound diagnosis performed a second read of all ultrasound images. We used data from 11 of 12 study sites to develop two machine learning (ML) algorithms using unimodal information (ultrasound features generated by the ultrasound experts) to classify breast masses which were validated on the remaining study site. The same ML algorithms were subsequently developed and validated on multi-modal information (clinical and demographic information plus ultrasound features). We assessed performance using area under the curve (AUC).
-
-                                         Results
-                                         Of 1288 breast masses, 368 (28.6%) were histopathologically malignant. In the external validation set (n = 373), the performance of the two unimodal ultrasound ML algorithms (AUC 0.83 and 0.82) was commensurate with performance of the human ultrasound experts (AUC 0.82 to 0.84; p for all comparisons &gt; 0.05). The multi-modal ultrasound ML algorithms performed significantly better (AUC 0.90 and 0.89) but were statistically inferior to routine breast cancer diagnosis (AUC 0.95, p for all comparisons <= 0.05).
-
-                                         Conclusions
-                                         The performance of humans and AI-based algorithms improves with multi-modal information.
-
-                                         Key Points
-                                         * The performance of humans and AI-based algorithms improves with multi-modal information.
-                                         * Multimodal AI-based algorithms do not necessarily outperform expert humans.
-                                         * Unimodal AI-based algorithms do not represent optimal performance to classify breast masses.},
+                                           AI-based algorithms for medical image analysis showed comparable performance to human image readers. However, in practice, diagnoses are made using multiple imaging modalities alongside other data sources. We determined the importance of this multi-modal information and compared the diagnostic performance of routine breast cancer diagnosis to breast ultrasound interpretations by humans or AI-based algorithms.
+  
+                                           Methods
+                                           Patients were recruited as part of a multicenter trial (NCT02638935). The trial enrolled 1288 women undergoing routine breast cancer diagnosis (multi-modal imaging, demographic, and clinical information). Three physicians specialized in ultrasound diagnosis performed a second read of all ultrasound images. We used data from 11 of 12 study sites to develop two machine learning (ML) algorithms using unimodal information (ultrasound features generated by the ultrasound experts) to classify breast masses which were validated on the remaining study site. The same ML algorithms were subsequently developed and validated on multi-modal information (clinical and demographic information plus ultrasound features). We assessed performance using area under the curve (AUC).
+  
+                                           Results
+                                           Of 1288 breast masses, 368 (28.6%) were histopathologically malignant. In the external validation set (n = 373), the performance of the two unimodal ultrasound ML algorithms (AUC 0.83 and 0.82) was commensurate with performance of the human ultrasound experts (AUC 0.82 to 0.84; p for all comparisons > 0.05). The multi-modal ultrasound ML algorithms performed significantly better (AUC 0.90 and 0.89) but were statistically inferior to routine breast cancer diagnosis (AUC 0.95, p for all comparisons <= 0.05).
+  
+                                           Conclusions
+                                           The performance of humans and AI-based algorithms improves with multi-modal information.
+  
+                                           Key Points
+                                           * The performance of humans and AI-based algorithms improves with multi-modal information.
+                                           * Multimodal AI-based algorithms do not necessarily outperform expert humans.
+                                           * Unimodal AI-based algorithms do not represent optimal performance to classify breast masses.},
   all_ss_ids = {[37a282e170103af22fc2ff850385a340c4394760]},
   automatic = {yes},
   citation-count = {7},
@@ -24269,9 +24301,9 @@ @article{Pfob24
   doi = {10.1158/1538-7445.sabcs23-po3-07-02},
   year = {2024},
   abstract = {Abstract
-                      Background: Breast Elastography, a technique that quantifies tissue stiffness, has been evaluated to objectify and improve the performance of B-mode breast ultrasound. However, large prospective trials showed benefits in BI-RADS 4a breast masses only and a high operator dependency. Modern Artificial Intelligence techniques for automated image processing like radiomics, a technique where quantified features are extracted from images, may overcome these limitations. We aimed to develop and validate radiomics models based on B-mode and Strain Elastography (SE) images for patients with BI-RADS 3 or 4 breast masses and compare their performance to the respective human experts. Methods: This is a secondary analysis of an international, multicenter trial (NCT02638935), evaluating the performance of SE in women with BI-RADS 3 or 4 breast masses. Women were recruited at 12 institutions in 7 countries and underwent B-mode breast ultrasound as well as SE. B-mode images were saved and re-assed by three ultrasound readers ( &amp;gt;10 years of experience), resulting in three independent assessments and a final consensus assessment. SE was interpreted using the E-/B ratio. B-mode and strain images were manually segmented and quantitative radiomics features were extracted using pyradiomics. We used 10-fold cross-validation to build machine learning models (XGBoostTree, MARS) based on data of 11 of 12 study sites. The data of the 12th (largest) study site was used as external validation set. Performance metrics included sensitivity, specificity and area under the receiver operator characteristic curve (AUROC). Results: The study included a total of 1288 patients, 1206 with evaluable B-mode images and 1190 with evaluable Strain images. Mean age was 46.6 years (SD 16.02) and a total number of 29.0% (350 of 1206) and 28.9% (344 of 1190) breast masses were malignant in the B-mode and Strain cohort, respectively. 
Distribution of BI-RADS categories was 33.0%, 34.5%, 14.5%, and 18.0% for BI-RADS 3, 4a, 4b, and 4c, respectively. In the external validation set (n = 342), the B-mode radiomics model (XGBoostTree) achieved an AUROC of 0.86 (95% CI 0.82 to 0.90), with a sensitivity of 97.4% (95% CI 0.93 to 1.00, 113 of 116) and a specificity of 27.0% (95% CI 0.21 to 0.33, 61 of 226). The model showed equivalent performance compared to the three ultrasound readers (P = 0.133); see also Table 1. In the external validation set (n = 333), the Strain radiomics model (MARS) achieved an AUROC of 0.84 (95% CI 0.79 to 0.88), with a sensitivity of 100% (95% CI 47.0 to 58.0, 115 of 115) and a specificity of 25.5% (95% CI 0.22 to 0.34, 60 of 218). The model showed equivalent performance compared to the three ultrasound readers (P = 0.696) and performed significantly better compared to SE (P = 0.002); see also Table 1. Sensitivity of the strain model was descriptively higher (100% vs. 97.4%, see Table 1). Both models were well-calibrated. Conclusion: This is the largest development and validation study for radiomics models based on B-mode breast ultrasound and SE, to date. The radiomics models performed on par with human readers, with the strain radiomics model showing potential to identify initially missed carcinomas in BI-RADS 3 breast masses. Future implementation studies may evaluate the performance of these image analysis algorithms in clinical routine and their integration into the multi-modal breast cancer diagnostics process, including mammography and MRI.
-                      Table 1. Diagnostic performance metrics
-                      Citation Format: Andr\'{e} Pfob, Tanja He, Lie Cai, Richard G. Barr, Volker Duda, Zaher Alwafai, Corinne Balleyguier, Dirk-Andr\'{e} Clevert, Sarah Fastner, Christina Gomez, Manuela Goncalo, Ines Gruber, Markus Hahn, Andr\'{e} Hennigs, Panagiotis Kapetas, Sheng-Chieh Lu, Juliane Nees, Ralf Ohlinger, Fabian Riedel, Matthieu Rutten, Benedikt Schaefgen, Anne Stieber, Riku Togawa, Mitsuhiro Tozaki, Sebastian Wojcinski, Cai Xu, Geraldine Rauch, J\"{o}rg Heil, Chris Sidey-Gibbons, Michael Golatta. Radiomics Models for B-mode Breast Ultrasound and Strain Elastography to improve Breast Cancer Diagnosis (INSPiRED 005): An International, Multicenter Analysis [abstract]. In: Proceedings of the 2023 San Antonio Breast Cancer Symposium; 2023 Dec 5-9; San Antonio, TX. Philadelphia (PA): AACR; Cancer Res 2024;84(9 Suppl):Abstract nr PO3-07-02.},
+                        Background: Breast Elastography, a technique that quantifies tissue stiffness, has been evaluated to objectify and improve the performance of B-mode breast ultrasound. However, large prospective trials showed benefits in BI-RADS 4a breast masses only and a high operator dependency. Modern Artificial Intelligence techniques for automated image processing like radiomics, a technique where quantified features are extracted from images, may overcome these limitations. We aimed to develop and validate radiomics models based on B-mode and Strain Elastography (SE) images for patients with BI-RADS 3 or 4 breast masses and compare their performance to the respective human experts. Methods: This is a secondary analysis of an international, multicenter trial (NCT02638935), evaluating the performance of SE in women with BI-RADS 3 or 4 breast masses. Women were recruited at 12 institutions in 7 countries and underwent B-mode breast ultrasound as well as SE. B-mode images were saved and re-assessed by three ultrasound readers (>10 years of experience), resulting in three independent assessments and a final consensus assessment. SE was interpreted using the E-/B ratio. B-mode and strain images were manually segmented and quantitative radiomics features were extracted using pyradiomics. We used 10-fold cross-validation to build machine learning models (XGBoostTree, MARS) based on data of 11 of 12 study sites. The data of the 12th (largest) study site was used as external validation set. Performance metrics included sensitivity, specificity and area under the receiver operator characteristic curve (AUROC). Results: The study included a total of 1288 patients, 1206 with evaluable B-mode images and 1190 with evaluable Strain images. Mean age was 46.6 years (SD 16.02) and a total number of 29.0% (350 of 1206) and 28.9% (344 of 1190) breast masses were malignant in the B-mode and Strain cohort, respectively. 
Distribution of BI-RADS categories was 33.0%, 34.5%, 14.5%, and 18.0% for BI-RADS 3, 4a, 4b, and 4c, respectively. In the external validation set (n = 342), the B-mode radiomics model (XGBoostTree) achieved an AUROC of 0.86 (95% CI 0.82 to 0.90), with a sensitivity of 97.4% (95% CI 0.93 to 1.00, 113 of 116) and a specificity of 27.0% (95% CI 0.21 to 0.33, 61 of 226). The model showed equivalent performance compared to the three ultrasound readers (P = 0.133); see also Table 1. In the external validation set (n = 333), the Strain radiomics model (MARS) achieved an AUROC of 0.84 (95% CI 0.79 to 0.88), with a sensitivity of 100% (95% CI 47.0 to 58.0, 115 of 115) and a specificity of 25.5% (95% CI 0.22 to 0.34, 60 of 218). The model showed equivalent performance compared to the three ultrasound readers (P = 0.696) and performed significantly better compared to SE (P = 0.002); see also Table 1. Sensitivity of the strain model was descriptively higher (100% vs. 97.4%, see Table 1). Both models were well-calibrated. Conclusion: This is the largest development and validation study for radiomics models based on B-mode breast ultrasound and SE, to date. The radiomics models performed on par with human readers, with the strain radiomics model showing potential to identify initially missed carcinomas in BI-RADS 3 breast masses. Future implementation studies may evaluate the performance of these image analysis algorithms in clinical routine and their integration into the multi-modal breast cancer diagnostics process, including mammography and MRI.
+                        Table 1. Diagnostic performance metrics
+                        Citation Format: Andr\'{e} Pfob, Tanja He, Lie Cai, Richard G. Barr, Volker Duda, Zaher Alwafai, Corinne Balleyguier, Dirk-Andr\'{e} Clevert, Sarah Fastner, Christina Gomez, Manuela Goncalo, Ines Gruber, Markus Hahn, Andr\'{e} Hennigs, Panagiotis Kapetas, Sheng-Chieh Lu, Juliane Nees, Ralf Ohlinger, Fabian Riedel, Matthieu Rutten, Benedikt Schaefgen, Anne Stieber, Riku Togawa, Mitsuhiro Tozaki, Sebastian Wojcinski, Cai Xu, Geraldine Rauch, J\"{o}rg Heil, Chris Sidey-Gibbons, Michael Golatta. Radiomics Models for B-mode Breast Ultrasound and Strain Elastography to improve Breast Cancer Diagnosis (INSPiRED 005): An International, Multicenter Analysis [abstract]. In: Proceedings of the 2023 San Antonio Breast Cancer Symposium; 2023 Dec 5-9; San Antonio, TX. Philadelphia (PA): AACR; Cancer Res 2024;84(9 Suppl):Abstract nr PO3-07-02.},
   url = {http://dx.doi.org/10.1158/1538-7445.sabcs23-po3-07-02},
   file = {Pfob24.pdf:pdf\\Pfob24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -24491,9 +24523,9 @@ @article{Pinc22
   pages = {64},
   volume = {2},
   abstract = {Background: The first sign of metastatic prostate cancer after radical prostatectomy is rising PSA levels in the blood, termed biochemical recurrence. The prediction of recurrence relies mainly on the morphological assessment of prostate cancer using the Gleason grading system. However, in this system, within-grade morphological patterns and subtle histopathological features are currently omitted, leaving a significant amount of prognostic potential unexplored.
-                             Methods: To discover additional prognostic information using artificial intelligence, we trained a deep learning system to predict biochemical recurrence from tissue in H\&E-stained microarray cores directly. We developed a morphological biomarker using convolutional neural networks leveraging a nested case-control study of 685 patients and validated on an independent cohort of 204 patients. We use concept-based explainability methods to interpret the learned tissue patterns.
-                             Results: The biomarker provides a strong correlation with biochemical recurrence in two sets (n = 182 and n = 204) from separate institutions. Concept-based explanations provided tissue patterns interpretable by pathologists.
-                             Conclusions: These results show that the model finds predictive power in the tissue beyond the morphological ISUP grading.},
+                               Methods: To discover additional prognostic information using artificial intelligence, we trained a deep learning system to predict biochemical recurrence from tissue in H\&E-stained microarray cores directly. We developed a morphological biomarker using convolutional neural networks leveraging a nested case-control study of 685 patients and validated on an independent cohort of 204 patients. We use concept-based explainability methods to interpret the learned tissue patterns.
+                               Results: The biomarker provides a strong correlation with biochemical recurrence in two sets (n = 182 and n = 204) from separate institutions. Concept-based explanations provided tissue patterns interpretable by pathologists.
+                               Conclusions: These results show that the model finds predictive power in the tissue beyond the morphological ISUP grading.},
   file = {:pdf/Pinc22.pdf:PDF},
   journal = COMMMED,
   pmid = {35693032},
@@ -24681,16 +24713,16 @@ @article{Pomp16a
   pages = {2008-2013},
   doi = {10.1016/j.ejrad.2016.09.009},
   abstract = {Objectives
-                                                       Airway wall thickness (AWT) is affected by changes in lung volume. This study evaluated whether correcting AWT on computed tomography (CT) for differences in inspiration level improves measurement agreement, reliability, and power to detect changes over time.
-
-                                                       Methods
-                                                       Participants of the Dutch-Belgian lung cancer screening trial who underwent 3-month repeat CT for an indeterminate pulmonary nodule were included. AWT on CT was calculated by the square root of the wall area at a theoretical airway with an internal perimeter of 10?mm (Pi10). The scan with the highest lung volume was labelled as the reference scan and the scan with the lowest lung volume was labelled as the comparison scan. Pi10 derived from the comparison scan was corrected by multiplying it with the ratio of CT lung volume of the comparison scan to CT lung volume on the reference scan. Agreement of uncorrected and corrected Pi10 was studied with the Bland-Altman method, reliability with intra-class correlation coefficients (ICC), and power to detect changes over time was calculated.
-
-                                                       Results
-                                                       315 male participants were included. Limit of agreement and reliability for Pi10 was ?0.61 to 0.57?mm (ICC?=?0.87), which improved to ?0.38 to 0.37?mm (ICC?=?0.94) after correction for inspiration level. To detect a 15% change over 3 months, 71 subjects are needed for Pi10 and 26 subjects for Pi10 adjusted for inspiration level.
-
-                                                       Conclusions
-                                                       Correcting Pi10 for differences in inspiration level improves reliability, agreement, and power to detect changes over time.},
+                                                         Airway wall thickness (AWT) is affected by changes in lung volume. This study evaluated whether correcting AWT on computed tomography (CT) for differences in inspiration level improves measurement agreement, reliability, and power to detect changes over time.
+  
+                                                         Methods
+                                                         Participants of the Dutch-Belgian lung cancer screening trial who underwent 3-month repeat CT for an indeterminate pulmonary nodule were included. AWT on CT was calculated by the square root of the wall area at a theoretical airway with an internal perimeter of 10 mm (Pi10). The scan with the highest lung volume was labelled as the reference scan and the scan with the lowest lung volume was labelled as the comparison scan. Pi10 derived from the comparison scan was corrected by multiplying it with the ratio of CT lung volume of the comparison scan to CT lung volume on the reference scan. Agreement of uncorrected and corrected Pi10 was studied with the Bland-Altman method, reliability with intra-class correlation coefficients (ICC), and power to detect changes over time was calculated.
+  
+                                                         Results
+                                                         315 male participants were included. Limit of agreement and reliability for Pi10 was -0.61 to 0.57 mm (ICC = 0.87), which improved to -0.38 to 0.37 mm (ICC = 0.94) after correction for inspiration level. To detect a 15% change over 3 months, 71 subjects are needed for Pi10 and 26 subjects for Pi10 adjusted for inspiration level.
+  
+                                                         Conclusions
+                                                         Correcting Pi10 for differences in inspiration level improves reliability, agreement, and power to detect changes over time.},
   file = {Pomp16a.pdf:pdf\\Pomp16a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27776653},
@@ -24743,7 +24775,7 @@ @article{Post13a
   pages = {114-121},
   volume = {63},
   abstract = {Aims: Virtual microscopy offers major advantages for pathology practice, separating slide evaluation from slide production. The aim of this study was to investigate the reliability of using whole slide images as compared with routine glass slides for diagnostic purposes. Methods and results: Colon biopsies (n = 295) were assessed using both glass slides and whole slide images by four pathologists and two residents. Two pathologists scored the digital images of biopsies in a primary diagnostic setting. For each case, the consensus diagnosis was de?ned as the majority diagnosis on the study's glass slides. All diagnoses were grouped into seven main diagnostic categories, and further divided into subgroups. The overall concor
-                                                       dance rates were 89.6% for whole slide images and 91.6% for light microscopy. The concordance rates of the subgroups 'adenoma' and 'adenocarcinoma' between whole slide images and conventional microscopy showed only small variability. The intraobserver (whole slide images versus glass slide) agreement, including subgroups, was substantial, with a mean j-value of 0.78, and was higher than the interobserver agreement for glass slides (interobserver j-value of 0.69). Conclusions: This study shows good diagnostic accuracy and reproducibility for virtual microscopy, indicating that this technology can reliably be used for pathological evaluation of colon biopsies in a primary clinical setting.},
+                                                         dance rates were 89.6% for whole slide images and 91.6% for light microscopy. The concordance rates of the subgroups 'adenoma' and 'adenocarcinoma' between whole slide images and conventional microscopy showed only small variability. The intraobserver (whole slide images versus glass slide) agreement, including subgroups, was substantial, with a mean $\kappa$-value of 0.78, and was higher than the interobserver agreement for glass slides (interobserver $\kappa$-value of 0.69). Conclusions: This study shows good diagnostic accuracy and reproducibility for virtual microscopy, indicating that this technology can reliably be used for pathological evaluation of colon biopsies in a primary clinical setting.},
   file = {Post13.pdf:pdf\\post13.pdf:PDF},
   journal = Histopathology,
   month = {3},
@@ -25106,19 +25138,20 @@ @book{Rao21
   gscites = {3},
 }
 
-@inproceedings{Raza15,
-  author = {Razavi, Mohammad and Wang, Lei and Gubern-M\'{e}rida, Albert and Ivanovska, Tatyana and Laue, Hendrik and Karssemeijer, Nico and Hahn, Horst K},
-  title = {Towards Accurate Segmentation of Fibroglandular Tissue in Breast {MRI} Using Fuzzy C-Means and Skin-Folds Removal},
-  booktitle = {Image Analysis and ProcessingAC/a,!aEUR?ICIAP 2015},
+@book{Raza15,
+  author = {Razavi, Mohammad and Wang, Lei and Gubern-M\'{e}rida, Albert and Ivanovska, Tatyana and Laue, Hendrik and Karssemeijer, Nico and Hahn, Horst K.},
+  title = {Towards Accurate Segmentation of Fibroglandular Tissue in Breast {MRI} Using Fuzzy C-Means and Skin-Folds Removal},
+  doi = {10.1007/978-3-319-23231-7_47},
   year = {2015},
-  publisher = {Springer},
-  pages = {528--536},
+  abstract = {Abstract unavailable},
+  url = {http://dx.doi.org/10.1007/978-3-319-23231-7_47},
   file = {Raza15.pdf:pdf\\Raza15.pdf:PDF},
-  optnote = {DIAG},
-  gsid = {8743979472382399251},
-  gscites = {11},
-  ss_id = {ba17f21a7df9005f18a97e1385b83d90709c8ebe},
+  optnote = {DIAG, RADIOLOGY},
+  journal = {Lecture Notes in Computer Science},
+  automatic = {yes},
   all_ss_ids = {['ba17f21a7df9005f18a97e1385b83d90709c8ebe']},
+  citation-count = {8},
+  pages = {528-536},
 }
 
 @book{Raza16,
@@ -25194,7 +25227,7 @@ @inproceedings{Rein21
   title = {Common limitations of performance metrics in biomedical image analysis},
   doi = {10.1117/12.2549650},
   abstract = {Diffuse large B-cell lymphoma (DLBCL) is the most common type of B-cell lymphoma. It is characterized by a heterogeneous morphology, genetic changes and clinical behavior. A small specific subgroup of DLBCL, harbouring a MYC gene translocation is associated with worse patient prognosis and outcome. Typically, the MYC translocation is assessed with a molecular test (FISH), that is expensive and time-consuming. Our hypothesis is that genetic changes, such as translocations could be visible as changes in the morphology of an HE-stained specimen. However, it has not proven possible to use morphological criteria for the detection of a MYC translocation in the diagnostic setting due to lack of specificity.
-                             In this paper, we apply a deep learning model to automate detection of the MYC translocations in DLBCL based on HE-stained specimens. The proposed method works at the whole-slide level and was developed based on a multicenter data cohort of 91 patients. All specimens were stained with HE, and the MYC translocation was confirmed using fluorescence in situ hybridization (FISH). The system was evaluated on an additional 66 patients, and obtained AUROC of 0.83 and accuracy of 0.77. The proposed method presents proof of a concept giving insights in the applicability of deep learning methods for detection of a genetic changes in DLBCL. In future work we will evaluate our algorithm for automatic pre-screen of DLBCL specimens to obviate FISH analysis in a large number of patients.},
+                               In this paper, we apply a deep learning model to automate detection of the MYC translocations in DLBCL based on HE-stained specimens. The proposed method works at the whole-slide level and was developed based on a multicenter data cohort of 91 patients. All specimens were stained with HE, and the MYC translocation was confirmed using fluorescence in situ hybridization (FISH). The system was evaluated on an additional 66 patients, and obtained AUROC of 0.83 and accuracy of 0.77. The proposed method presents proof of a concept giving insights in the applicability of deep learning methods for detection of a genetic changes in DLBCL. In future work we will evaluate our algorithm for automatic pre-screen of DLBCL specimens to obviate FISH analysis in a large number of patients.},
   all_ss_ids = {['a9dc7e9f174dbd624d5a8294ca6dc9b671e1ae97']},
   file = {:pdf/Rein21.pdf:PDF},
   gscites = {3},
@@ -25507,7 +25540,7 @@ @mastersthesis{Rijt19
   author = {Mart van Rijthoven},
   title = {Cancer research in digital pathology using convolutional neural networks},
   abstract = {Understanding the progression of cancer is at the core of cancer research. In this thesis we combine high resolution features with low resolution contextual features to automatic segment cancerous associated tissue in gigapixel histopathology whole slide images (WSIs) stained with hematoxylin and eosin. We take advantage of the multi-resolution data structure of WSIs and obtain contextual features through the use of dilated convolutions. Our proposed multi-resolution method has a comparable F1-score performance compared to a single low resolution method. Furthermore, the proposed method increases the F1-score by 34% for ductal Carcinoma In Situ (DCIS) and 12% for invasive ductal carcinoma (IDC) in comparison with a single high resolution method.
-                             Lymphocytes play an important role in the progression of cancer. In the second part of this thesis, we boost the potential of the You Only Look Once (YOLO) architecture applied to automatic detection of lymphocytes in WSIs stained with immunohistochemistry by (1) tailoring the YOLO architecture to lymphocyte detection in WSI; (2) guiding training data sampling by exploiting prior knowledge on hard negative samples; (3) pairing the proposed sampling strategy with the focal loss technique. The combination of the proposed improvements increases the F1-score of YOLO by 3% with a speed-up of 4.3X.},
+                               Lymphocytes play an important role in the progression of cancer. In the second part of this thesis, we boost the potential of the You Only Look Once (YOLO) architecture applied to automatic detection of lymphocytes in WSIs stained with immunohistochemistry by (1) tailoring the YOLO architecture to lymphocyte detection in WSI; (2) guiding training data sampling by exploiting prior knowledge on hard negative samples; (3) pairing the proposed sampling strategy with the focal loss technique. The combination of the proposed improvements increases the F1-score of YOLO by 3% with a speed-up of 4.3X.},
   file = {Rijt19.pdf:pdf\\Rijt19.pdf:PDF},
   optnote = {DIAG},
   school = {University of Utrecht},
@@ -26260,21 +26293,21 @@ @article{Roes24
   doi = {10.1097/rli.0000000000001102},
   year = {2024},
   abstract = {
-             Objectives
-             Deep learning (DL) studies for the detection of clinically significant prostate cancer (csPCa) on magnetic resonance imaging (MRI) often overlook potentially relevant clinical parameters such as prostate-specific antigen, prostate volume, and age. This study explored the integration of clinical parameters and MRI-based DL to enhance diagnostic accuracy for csPCa on MRI.
-
-
-             Materials and Methods
-             We retrospectively analyzed 932 biparametric prostate MRI examinations performed for suspected csPCa (ISUP >=2) at 2 institutions. Each MRI scan was automatically analyzed by a previously developed DL model to detect and segment csPCa lesions. Three sets of features were extracted: DL lesion suspicion levels, clinical parameters (prostate-specific antigen, prostate volume, age), and MRI-based lesion volumes for all DL-detected lesions. Six multimodal artificial intelligence (AI) classifiers were trained for each combination of feature sets, employing both early (feature-level) and late (decision-level) information fusion methods. The diagnostic performance of each model was tested internally on 20% of center 1 data and externally on center 2 data (n = 529). Receiver operating characteristic comparisons determined the optimal feature combination and information fusion method and assessed the benefit of multimodal versus unimodal analysis. The optimal model performance was compared with a radiologist using PI-RADS.
-
-
-             Results
-             Internally, the multimodal AI integrating DL suspicion levels with clinical features via early fusion achieved the highest performance. Externally, it surpassed baselines using clinical parameters (0.77 vs 0.67 area under the curve [AUC], <jats:italic toggle="yes">P &lt; 0.001) and DL suspicion levels alone (AUC: 0.77 vs 0.70, <jats:italic toggle="yes">P = 0.006). Early fusion outperformed late fusion in external data (0.77 vs 0.73 AUC, <jats:italic toggle="yes">P = 0.005). No significant performance gaps were observed between multimodal AI and radiologist assessments (internal: 0.87 vs 0.88 AUC; external: 0.77 vs 0.75 AUC, both <jats:italic toggle="yes">P &gt; 0.05).
-
-
-             Conclusions
-             Multimodal AI (combining DL suspicion levels and clinical parameters) outperforms clinical and MRI-only AI for csPCa detection. Early information fusion enhanced AI robustness in our multicenter setting. Incorporating lesion volumes did not enhance diagnostic efficacy.
-           },
+               Objectives
+               Deep learning (DL) studies for the detection of clinically significant prostate cancer (csPCa) on magnetic resonance imaging (MRI) often overlook potentially relevant clinical parameters such as prostate-specific antigen, prostate volume, and age. This study explored the integration of clinical parameters and MRI-based DL to enhance diagnostic accuracy for csPCa on MRI.
+  
+  
+               Materials and Methods
+               We retrospectively analyzed 932 biparametric prostate MRI examinations performed for suspected csPCa (ISUP >=2) at 2 institutions. Each MRI scan was automatically analyzed by a previously developed DL model to detect and segment csPCa lesions. Three sets of features were extracted: DL lesion suspicion levels, clinical parameters (prostate-specific antigen, prostate volume, age), and MRI-based lesion volumes for all DL-detected lesions. Six multimodal artificial intelligence (AI) classifiers were trained for each combination of feature sets, employing both early (feature-level) and late (decision-level) information fusion methods. The diagnostic performance of each model was tested internally on 20% of center 1 data and externally on center 2 data (n = 529). Receiver operating characteristic comparisons determined the optimal feature combination and information fusion method and assessed the benefit of multimodal versus unimodal analysis. The optimal model performance was compared with a radiologist using PI-RADS.
+  
+  
+               Results
+               Internally, the multimodal AI integrating DL suspicion levels with clinical features via early fusion achieved the highest performance. Externally, it surpassed baselines using clinical parameters (0.77 vs 0.67 area under the curve [AUC], P < 0.001) and DL suspicion levels alone (AUC: 0.77 vs 0.70, P = 0.006). Early fusion outperformed late fusion in external data (0.77 vs 0.73 AUC, P = 0.005). No significant performance gaps were observed between multimodal AI and radiologist assessments (internal: 0.87 vs 0.88 AUC; external: 0.77 vs 0.75 AUC, both P > 0.05).
+  
+  
+               Conclusions
+               Multimodal AI (combining DL suspicion levels and clinical parameters) outperforms clinical and MRI-only AI for csPCa detection. Early information fusion enhanced AI robustness in our multicenter setting. Incorporating lesion volumes did not enhance diagnostic efficacy.
+             },
   url = {http://dx.doi.org/10.1097/RLI.0000000000001102},
   file = {Roes24.pdf:pdf\\Roes24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -26333,19 +26366,19 @@ @article{Roo22
   url = {http://dx.doi.org/10.1097/GOX.0000000000004495},
   volume = {10},
   abstract = {Background:
-                                     Surgeons often prefer to use a tourniquet during minor procedures, such as carpal tunnel release (CTR) or trigger finger release (TFR). Besides the possible discomfort for the patient, the effect of tourniquet use on long-term results and complications is unknown. Our primary aim was to compare the patient-reported outcomes 1 year after CTR or TFR under local anesthesia with or without tourniquet. Secondary outcomes included satisfaction, sonographically estimated scar tissue thickness after CTR, and postoperative complications.
-
-
-                                     Methods:
-                                     Between May 2019 and May 2020, 163 patients planned for open CTR or TFR under local anesthesia were included. Before surgery, and at 3, 6, and 12 months postoperatively, Quick Disabilities of the Arm, Shoulder and Hand and Boston Carpal Tunnel questionnaires were administered, and complications were noted. At 6 months postoperatively, an ultrasound was conducted to determine the thickness of scar tissue in the region of median nerve.
-
-
-                                     Results:
-                                     A total of 142 patients (51 men [38%]) were included. The Quick Disabilities of the Arm, Shoulder and Hand questionnaire and Boston Carpal Tunnel Questionnaire scores improved significantly in both groups during follow-up, wherein most improvements were seen in the first 3 months. No difference in clinical outcome and scar tissue formation was found between the two groups after 12 months. The complication rate was comparable between both groups. Thirty-two (24%) patients had at least one complication, none needed surgical interventions, and no recurrent symptoms were seen.
-
-
-                                     Conclusions:
-                                     Our study shows similar long-term clinical outcomes, formation of scar tissue, and complication rates for patients undergoing CTR or TFR with or without a tourniquet. Tourniquet usage should be based on shared decision-making.},
+                                       Surgeons often prefer to use a tourniquet during minor procedures, such as carpal tunnel release (CTR) or trigger finger release (TFR). Besides the possible discomfort for the patient, the effect of tourniquet use on long-term results and complications is unknown. Our primary aim was to compare the patient-reported outcomes 1 year after CTR or TFR under local anesthesia with or without tourniquet. Secondary outcomes included satisfaction, sonographically estimated scar tissue thickness after CTR, and postoperative complications.
+  
+  
+                                       Methods:
+                                       Between May 2019 and May 2020, 163 patients planned for open CTR or TFR under local anesthesia were included. Before surgery, and at 3, 6, and 12 months postoperatively, Quick Disabilities of the Arm, Shoulder and Hand and Boston Carpal Tunnel questionnaires were administered, and complications were noted. At 6 months postoperatively, an ultrasound was conducted to determine the thickness of scar tissue in the region of median nerve.
+  
+  
+                                       Results:
+                                       A total of 142 patients (51 men [38%]) were included. The Quick Disabilities of the Arm, Shoulder and Hand questionnaire and Boston Carpal Tunnel Questionnaire scores improved significantly in both groups during follow-up, wherein most improvements were seen in the first 3 months. No difference in clinical outcome and scar tissue formation was found between the two groups after 12 months. The complication rate was comparable between both groups. Thirty-two (24%) patients had at least one complication, none needed surgical interventions, and no recurrent symptoms were seen.
+  
+  
+                                       Conclusions:
+                                       Our study shows similar long-term clinical outcomes, formation of scar tissue, and complication rates for patients undergoing CTR or TFR with or without a tourniquet. Tourniquet usage should be based on shared decision-making.},
   all_ss_ids = {[7a4282bc46da2171e4bd282fda4f37caac50f3ba]},
   automatic = {yes},
   citation-count = {0},
@@ -26523,17 +26556,17 @@ @article{Rutg21
   url = {http://dx.doi.org/10.1186/s13000-021-01136-w},
   volume = {16},
   abstract = {Abstract
-                                         Background
-                                         Histopathological classification of Wilms tumors determines treatment regimen. Machine learning has been shown to contribute to histopathological classification in various malignancies but requires large numbers of manually annotated images and thus specific pathological knowledge. This study aimed to assess whether trained, inexperienced observers could contribute to reliable annotation of Wilms tumor components for classification performed by machine learning.
-
-                                         Methods
-                                         Four inexperienced observers (medical students) were trained in histopathology of normal kidneys and Wilms tumors by an experienced observer (pediatric pathologist). Twenty randomly selected scanned Wilms tumor-slides (from n = 1472 slides) were annotated, and annotations were independently classified by both the inexperienced observers and two experienced pediatric pathologists. Agreement between the six observers and for each tissue element was measured using kappa statistics (k).
-
-                                         Results
-                                         Pairwise interobserver agreement between all inexperienced and experienced observers was high (range: 0.845-0.950). The interobserver variability for the different histological elements, including all vital tumor components and therapy-related effects, showed high values for all k-coefficients (&gt; 0.827).
-
-                                         Conclusions
-                                         Inexperienced observers can be trained to recognize specific histopathological tumor and tissue elements with high interobserver agreement with experienced observers. Nevertheless, supervision by experienced pathologists remains necessary. Results of this study can be used to facilitate more rapid progress for supervised machine learning-based algorithm development in pediatric pathology and beyond.},
+                                           Background
+                                           Histopathological classification of Wilms tumors determines treatment regimen. Machine learning has been shown to contribute to histopathological classification in various malignancies but requires large numbers of manually annotated images and thus specific pathological knowledge. This study aimed to assess whether trained, inexperienced observers could contribute to reliable annotation of Wilms tumor components for classification performed by machine learning.
+  
+                                           Methods
+                                           Four inexperienced observers (medical students) were trained in histopathology of normal kidneys and Wilms tumors by an experienced observer (pediatric pathologist). Twenty randomly selected scanned Wilms tumor-slides (from n = 1472 slides) were annotated, and annotations were independently classified by both the inexperienced observers and two experienced pediatric pathologists. Agreement between the six observers and for each tissue element was measured using kappa statistics (k).
+  
+                                           Results
+                                           Pairwise interobserver agreement between all inexperienced and experienced observers was high (range: 0.845-0.950). The interobserver variability for the different histological elements, including all vital tumor components and therapy-related effects, showed high values for all k-coefficients (> 0.827).
+  
+                                           Conclusions
+                                           Inexperienced observers can be trained to recognize specific histopathological tumor and tissue elements with high interobserver agreement with experienced observers. Nevertheless, supervision by experienced pathologists remains necessary. Results of this study can be used to facilitate more rapid progress for supervised machine learning-based algorithm development in pediatric pathology and beyond.},
   all_ss_ids = {[2dc667c51f9d881cdb16ddb590c86b35a910506c]},
   automatic = {yes},
   citation-count = {3},
@@ -26642,8 +26675,8 @@ @article{Saha21a
   doi = {10.1016/j.media.2021.102155},
   url = {https://www.sciencedirect.com/science/article/pii/S1361841521002012},
   abstract = {We present a multi-stage 3D computer-aided detection and diagnosis (CAD) model for automated localization of clinically significant prostate cancer (csPCa) in bi-parametric MR imaging (bpMRI). Deep attention mechanisms drive its detection network, targeting salient structures and highly discriminative feature dimensions across multiple resolutions. Its goal is to accurately identify csPCa lesions from indolent cancer and the wide range of benign pathology that can afflict the prostate gland. Simultaneously, a decoupled residual classifier is used to achieve consistent false positive reduction, without sacrificing high sensitivity or computational efficiency. In order to guide model generalization with domain-specific clinical knowledge, a probabilistic anatomical prior is used to encode the spatial prevalence and zonal distinction of csPCa. Using a large dataset of 1950 prostate bpMRI paired with radiologically-estimated annotations, we hypothesize that such CNN-based models can be trained to detect biopsy-confirmed malignancies in an independent cohort.
-
-                               For 486 institutional testing scans, the 3D CAD system achieves 83.69+-5.22% and 93.19+-2.96% detection sensitivity at 0.50 and 1.46 false positive(s) per patient, respectively, with 0.882+-0.030 AUROC in patient-based diagnosis -significantly outperforming four state-of-the-art baseline architectures (U-SEResNet, UNet++, nnU-Net, Attention U-Net) from recent literature. For 296 external biopsy-confirmed testing scans, the ensembled CAD system shares moderate agreement with a consensus of expert radiologists (76.69%; kappa = 0.51+-0.04) and independent pathologists (81.08%; kappa = 0.56+-0.06); demonstrating strong generalization to histologically-confirmed csPCa diagnosis.},
+  
+                                 For 486 institutional testing scans, the 3D CAD system achieves 83.69+-5.22% and 93.19+-2.96% detection sensitivity at 0.50 and 1.46 false positive(s) per patient, respectively, with 0.882+-0.030 AUROC in patient-based diagnosis -significantly outperforming four state-of-the-art baseline architectures (U-SEResNet, UNet++, nnU-Net, Attention U-Net) from recent literature. For 296 external biopsy-confirmed testing scans, the ensembled CAD system shares moderate agreement with a consensus of expert radiologists (76.69%; kappa = 0.51+-0.04) and independent pathologists (81.08%; kappa = 0.56+-0.06); demonstrating strong generalization to histologically-confirmed csPCa diagnosis.},
   optnote = {DIAG, RADIOLOGY},
   algorithm = {https://grand-challenge.org/algorithms/prostate-mri-cad-cspca/},
   gsid = {10384137846444027679},
@@ -26756,7 +26789,7 @@ @article{Sama24
   doi = {10.1093/bjrai/ubae006},
   year = {2024},
   abstract = {Abstract
-                      Innovation in medical imaging artificial intelligence (AI)/machine learning (ML) demands extensive data collection, algorithmic advancements, and rigorous performance assessments encompassing aspects such as generalizability, uncertainty, bias, fairness, trustworthiness, and interpretability. Achieving widespread integration of AI/ML algorithms into diverse clinical tasks will demand a steadfast commitment to overcoming issues in model design, development, and performance assessment. The complexities of AI/ML clinical translation present substantial challenges, requiring engagement with relevant stakeholders, assessment of cost-effectiveness for user and patient benefit, timely dissemination of information relevant to robust functioning throughout the AI/ML lifecycle, consideration of regulatory compliance, and feedback loops for real-world performance evidence. This commentary addresses several hurdles for the development and adoption of AI/ML technologies in medical imaging. Comprehensive attention to these underlying and often subtle factors is critical not only for tackling the challenges but also for exploring novel opportunities for the advancement of AI in radiology.},
+                        Innovation in medical imaging artificial intelligence (AI)/machine learning (ML) demands extensive data collection, algorithmic advancements, and rigorous performance assessments encompassing aspects such as generalizability, uncertainty, bias, fairness, trustworthiness, and interpretability. Achieving widespread integration of AI/ML algorithms into diverse clinical tasks will demand a steadfast commitment to overcoming issues in model design, development, and performance assessment. The complexities of AI/ML clinical translation present substantial challenges, requiring engagement with relevant stakeholders, assessment of cost-effectiveness for user and patient benefit, timely dissemination of information relevant to robust functioning throughout the AI/ML lifecycle, consideration of regulatory compliance, and feedback loops for real-world performance evidence. This commentary addresses several hurdles for the development and adoption of AI/ML technologies in medical imaging. Comprehensive attention to these underlying and often subtle factors is critical not only for tackling the challenges but also for exploring novel opportunities for the advancement of AI in radiology.},
   url = {http://dx.doi.org/10.1093/bjrai/ubae006},
   file = {Sama24.pdf:pdf\\Sama24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -27177,52 +27210,52 @@ @conference{Sand19
   booktitle = ISMRM,
   year = {2019},
   abstract = {Synopsis
-                                                       The aim of this study was to compare a prototype simultaneous multi-slice single-shot echo planar imaging (SMS-ss-DWI-EPI) sequence with
-                                                       conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance
-                                                       imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although
-                                                       the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence.
-
-                                                       Introduction
-                                                       The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an
-                                                       increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in
-                                                       most state-of-the-art breast MRI protocols . The echo train of the readout-segmented echo-planar imaging-based DWI sequence (rs-DWI-EPI) was
-                                                       shortened to reduce distortion and improve the resulting image quality. However, this sequence results in a lower signal-to-noise ratio (SNR) than singleshot
-                                                       echo planar imaging (ss-EPI) . In practice, detection of lesions on DWI is often problematic due to a relatively low lesion conspicuity. To improve the
-                                                       detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was
-                                                       developed. In this study we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and breast imaging
-                                                       reporting and data system (BI-RADS ) score.
-
-                                                       Methods
-                                                       From September 2017 to August 2018, 15 women with known breast cancer or suspicious breast lesions were included, after providing signed informed
-                                                       consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM
-                                                       Skyra system (Siemens Healthcare, Erlangen, Germany) using a 16-channel bilateral breast coil. Parameters of the rs-DWI-EPI sequence were: TR: 5450
-                                                       ms, TE: 57 ms, FoV: 340 mm, voxel size: 1.2x1.2x5 mm , acquisition time: 4:23 min, b-values: 50, 850 s/mm , SPAIR fat suppression. Parameters of the
-                                                       SMS-ss-DWI-EPI sequence were: TR: 4000 ms, TE: 70 ms, FoV: 360 mm, voxel size: 0.9(i)x0.9(i)x4 mm , acquisition time: 2:45 min, b-values: 50, 400,
-                                                       800 s/mm , SPAIR fat suppression. In addition, the clinical protocol included one pre- and five post-contrast administration regular T1-weighted Dixon
-                                                       acquisitions, ultrafast T1-weighted TWIST acquisitions during the inflow of contrast, and a T2 weighted Dixon acquisition. In total, 33 lesions (27 malignant,
-                                                       5 benign and 1 unknown) were detected on the contrast-enhanced series and described in the clinical MRI reports. Two dedicated breast radiologists (4
-                                                       and 10 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also
-                                                       independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible), and a BI-RADS score (1 to 5) was given for each lesion.
-                                                       Statistical analysis was performed in SPSS using the Wilcoxon signed-rank test.
-
-                                                       Results
-                                                       Results are presented in Table 1. Overall IQ was significantly higher for the conventional rs-DWI-EPI than for the SMS-ss-DWI-EPI (p=0.006). Lesion
-                                                       conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.016). Benign lesions had similar conspicuity with both sequences while malignant
-                                                       lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.027) (for example, see Figure 1). There was no significant difference in BI-RADS
-                                                       scores (p=0.151) between the two sequences.
-
-                                                       Discussion
-                                                       Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of
-                                                       malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI.
-                                                       Simultaneous Multi-Slice (SMS) ensures that slices are excited simultaneously with a multiband pulse, which leads to a reduced acquisition time. In our
-                                                       protocol, the combination of ss-EPI and SMS results in a higher spatial resolution while still having a shorter acquisition time than the conventional
-                                                       sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This
-                                                       may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is
-                                                       needed for improved IQ and even better lesion conspicuity. Extension of the data pool and evaluation by additional readers is pending.
-
-                                                       Conclusion
-                                                       Despite the perceived poorer image quality of the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image
-                                                       quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.},
+                                                         The aim of this study was to compare a prototype simultaneous multi-slice single-shot echo planar imaging (SMS-ss-DWI-EPI) sequence with
+                                                         conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance
+                                                         imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although
+                                                         the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence.
+  
+                                                         Introduction
+                                                         The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an
+                                                         increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in
+                                                         most state-of-the-art breast MRI protocols . The echo train of the readout-segmented echo-planar imaging-based DWI sequence (rs-DWI-EPI) was
+                                                         shortened to reduce distortion and improve the resulting image quality. However, this sequence results in a lower signal-to-noise ratio (SNR) than singleshot
+                                                         echo planar imaging (ss-EPI) . In practice, detection of lesions on DWI is often problematic due to a relatively low lesion conspicuity. To improve the
+                                                         detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was
+                                                         developed. In this study we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and breast imaging
+                                                         reporting and data system (BI-RADS ) score.
+  
+                                                         Methods
+                                                         From September 2017 to August 2018, 15 women with known breast cancer or suspicious breast lesions were included, after providing signed informed
+                                                         consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM
+                                                         Skyra system (Siemens Healthcare, Erlangen, Germany) using a 16-channel bilateral breast coil. Parameters of the rs-DWI-EPI sequence were: TR: 5450
+                                                         ms, TE: 57 ms, FoV: 340 mm, voxel size: 1.2x1.2x5 mm$^3$, acquisition time: 4:23 min, b-values: 50, 850 s/mm$^2$, SPAIR fat suppression. Parameters of the
+                                                         SMS-ss-DWI-EPI sequence were: TR: 4000 ms, TE: 70 ms, FoV: 360 mm, voxel size: 0.9(i)x0.9(i)x4 mm$^3$, acquisition time: 2:45 min, b-values: 50, 400,
+                                                         800 s/mm$^2$, SPAIR fat suppression. In addition, the clinical protocol included one pre- and five post-contrast administration regular T1-weighted Dixon
+                                                         acquisitions, ultrafast T1-weighted TWIST acquisitions during the inflow of contrast, and a T2 weighted Dixon acquisition. In total, 33 lesions (27 malignant,
+                                                         5 benign and 1 unknown) were detected on the contrast-enhanced series and described in the clinical MRI reports. Two dedicated breast radiologists (4
+                                                         and 10 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also
+                                                         independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible), and a BI-RADS score (1 to 5) was given for each lesion.
+                                                         Statistical analysis was performed in SPSS using the Wilcoxon signed-rank test.
+  
+                                                         Results
+                                                         Results are presented in Table 1. Overall IQ was significantly higher for the conventional rs-DWI-EPI than for the SMS-ss-DWI-EPI (p=0.006). Lesion
+                                                         conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.016). Benign lesions had similar conspicuity with both sequences while malignant
+                                                         lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.027) (for example, see Figure 1). There was no significant difference in BI-RADS
+                                                         scores (p=0.151) between the two sequences.
+  
+                                                         Discussion
+                                                         Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of
+                                                         malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI.
+                                                         Simultaneous Multi-Slice (SMS) ensures that slices are excited simultaneously with a multiband pulse, which leads to a reduced acquisition time. In our
+                                                         protocol, the combination of ss-EPI and SMS results in a higher spatial resolution while still having a shorter acquisition time than the conventional
+                                                         sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This
+                                                         may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is
+                                                         needed for improved IQ and even better lesion conspicuity. Extension of the data pool and evaluation by additional readers is pending.
+  
+                                                         Conclusion
+                                                         Despite the perceived poorer image quality of the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image
+                                                         quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.},
   optnote = {DIAG},
 }
 
@@ -27251,22 +27284,22 @@ @conference{Sand20
   booktitle = {ISMRM Benelux},
   title = {Simultaneous multi-slice single-shot DWI compared to routine read-out-segmented DWI for evaluation of breast lesions},
   abstract = {Synopsis
-                                                       The aim of this study was to compare a prototype simultaneous multi-slice single-shot echo planar imaging (SMS-ss-DWI-EPI) sequence with conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence.
-
-                                                       Introduction
-                                                       The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in most state-of-the-art breast MRI protocols. The echo train of the readout-segmented echo-planar imaging-based DWI sequence (rs-DWI-EPI) was shortened to reduce distortion and improve the resulting image quality. However, this sequence results in a lower signal-to-noise ratio (SNR) than single-shot echo planar imaging (ss-EPI). In practice, detection of lesions on DWI is often problematic due to a relatively low lesion conspicuity. To improve the detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was developed. In this study, we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and the presence of artifacts.
-
-                                                       Methods
-                                                       From September 2017 to December 2018, 25 women with known breast cancer or suspicious breast lesions were included, after providing signed informed consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM Skyra system (Siemens Healthcare, Erlangen, Germany) using a 16-channel bilateral breast coil. Parameters of the rs-DWI-EPI sequence were: TR: 5450 ms, TE: 57 ms, FoV: 340 mm, voxel size: 1.2x1.2x5 mm , acquisition time: 4:23 min, b-values: 50, 850 s/mm , SPAIR fat suppression. Parameters of the SMS-ss-DWI-EPI sequence were: TR: 4000 ms, TE: 70 ms, FoV: 360 mm, voxel size: 0.9(i)x0.9(i)x4 mm, acquisition time: 2:45 min, b-values: 50, 400, 800 s/mm , SPAIR fat suppression. In addition, the clinical protocol included one pre- and five post-contrast regular T1-weighted Dixon acquisitions, ultrafast T1-weighted TWIST acquisitions during the inflow of contrast, and a T2 weighted Dixon acquisition. In total, 42 malignant (32 invasive ductal carcinomas, 4 invasive lobular carcinomas, 1 ductal carcinoma in situ and 5 other malignant lesions) and 12 benign lesions were detected on the contrast-enhanced series. Malignant lesions had a mean MRI size of 18.7 mm +- 15.1 mm (range: 3 - 92 mm) and benign lesions had a mean size of 5.9 mm +- 3.8 mm (range: 3 - 15 mm). Four dedicated breast radiologists (4 to 15 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible). Statistical analysis was performed in SPSS using Generalized Linear Models and the Wilcoxon signed-rank test.
-
-                                                       Results
-                                                       Overall IQ was significantly higher for the conventional rs-DWI-EPI (Mean +- SD: 5.5 +- 1.9) than for the SMS-ss-DWI-EPI (Mean +- SD: 4.2 +- 2.0) (p=0.002). Lesion conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.009). Benign lesions had similar conspicuity with both sequences while malignant lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.041) (for example, see Figure 1).
-                                                       Infolding and ghosting artifacts were scored as disturbing or worse by 2 or more radiologists in 6 and 15 cases, for Resolve and SMS respectively. Distortion artifacts were scored as disturbing or worse in 4 and 17 cases, respectively.
-
-                                                       Discussion: Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI. Simultaneous Multi-Slice (SMS) ensures that slices are excited simultaneously with a multiband pulse, which leads to a reduced acquisition time. In our protocol, the combination of ss-EPI and SMS results in a higher spatial resolution while still having a shorter acquisition time than the conventional sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is needed for improved IQ, decreased presence of artifacts and even better lesion conspicuity.
-
-                                                       Conclusion
-                                                       Despite the perceived poorer image quality and the more disturbing presence of artifacts in the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.},
+                                                         The aim of this study was to compare a prototype simultaneous multi-slice single-shot echo planar imaging (SMS-ss-DWI-EPI) sequence with conventional readout-segmented echo-planar imaging (rs-DWI-EPI) for diffusion-weighted imaging of the breast at 3T magnetic resonance imaging (MRI). A reader study was conducted to evaluate image quality, lesion conspicuity and BI-RADS score. Our results show that although the image quality with the conventional rs-DWI-EPI is superior, malignant lesions have improved visibility with the SMS-ss-DWI-EPI sequence.
+  
+                                                         Introduction
+                                                         The addition of diffusion-weighted imaging (DWI) to contrast-enhanced breast MRI improves the classification of breast lesions, which leads in turn to an increased positive predictive value of biopsies. Consequently, DWI with evaluation of the corresponding apparent diffusion coefficient (ADC) is included in most state-of-the-art breast MRI protocols. The echo train of the readout-segmented echo-planar imaging-based DWI sequence (rs-DWI-EPI) was shortened to reduce distortion and improve the resulting image quality. However, this sequence results in a lower signal-to-noise ratio (SNR) than single-shot echo planar imaging (ss-EPI). In practice, detection of lesions on DWI is often problematic due to a relatively low lesion conspicuity. To improve the detectability of lesions and the speed of acquisition, a prototype DWI sequence, the simultaneous multi-slice single-shot DWI-EPI (SMS-ss-DWI-EPI), was developed. In this study, we compare this prototype sequence with rs-DWI-EPI at 3T, in terms of image quality (IQ), lesion conspicuity, and the presence of artifacts.
+  
+                                                         Methods
+                                                         From September 2017 to December 2018, 25 women with known breast cancer or suspicious breast lesions were included, after providing signed informed consent. Women were scanned with the conventional rs-DWI-EPI and the SMS-ss-DWI-EPI during the same clinical examination on a 3T MAGNETOM Skyra system (Siemens Healthcare, Erlangen, Germany) using a 16-channel bilateral breast coil. Parameters of the rs-DWI-EPI sequence were: TR: 5450 ms, TE: 57 ms, FoV: 340 mm, voxel size: 1.2x1.2x5 mm$^3$, acquisition time: 4:23 min, b-values: 50, 850 s/mm$^2$, SPAIR fat suppression. Parameters of the SMS-ss-DWI-EPI sequence were: TR: 4000 ms, TE: 70 ms, FoV: 360 mm, voxel size: 0.9(i)x0.9(i)x4 mm$^3$, acquisition time: 2:45 min, b-values: 50, 400, 800 s/mm$^2$, SPAIR fat suppression. In addition, the clinical protocol included one pre- and five post-contrast regular T1-weighted Dixon acquisitions, ultrafast T1-weighted TWIST acquisitions during the inflow of contrast, and a T2 weighted Dixon acquisition. In total, 42 malignant (32 invasive ductal carcinomas, 4 invasive lobular carcinomas, 1 ductal carcinoma in situ and 5 other malignant lesions) and 12 benign lesions were detected on the contrast-enhanced series. Malignant lesions had a mean MRI size of 18.7 mm +- 15.1 mm (range: 3 - 92 mm) and benign lesions had a mean size of 5.9 mm +- 3.8 mm (range: 3 - 15 mm). Four dedicated breast radiologists (4 to 15 years of experience with breast MRI) independently scored both sequences for overall IQ (1: extremely poor to 9: excellent). All lesions were also independently evaluated for conspicuity (1: not visible, 2: visible if location is given, 3: visible). Statistical analysis was performed in SPSS using Generalized Linear Models and the Wilcoxon signed-rank test.
+  
+                                                         Results
+                                                         Overall IQ was significantly higher for the conventional rs-DWI-EPI (Mean +- SD: 5.5 +- 1.9) than for the SMS-ss-DWI-EPI (Mean +- SD: 4.2 +- 2.0) (p=0.002). Lesion conspicuity scores were significantly higher for SMS-ss-DWI-EPI (p=0.009). Benign lesions had similar conspicuity with both sequences while malignant lesions had significantly higher conspicuity with SMS-ss-DWI-EPI (p=0.041) (for example, see Figure 1).
+                                                         Infolding and ghosting artifacts were scored as disturbing or worse by 2 or more radiologists in 6 and 15 cases, for Resolve and SMS respectively. Distortion artifacts were scored as disturbing or worse in 4 and 17 cases, respectively.
+  
+                                                         Discussion: Although the conventional rs-DWI-EPI sequence results in better IQ, in general ss-EPI results in a higher SNR, which may lead to better visibility of malignant lesions with SMS-ss-DWI-EPI. This might eventually improve the clinical value of DWI in addition to contrast enhanced breast MRI. Simultaneous Multi-Slice (SMS) ensures that slices are excited simultaneously with a multiband pulse, which leads to a reduced acquisition time. In our protocol, the combination of ss-EPI and SMS results in a higher spatial resolution while still having a shorter acquisition time than the conventional sequence. The higher achievable spatial resolution may be an important factor for the improved lesion visibility, and conspicuity of malignant lesions. This may make the SMS approach suitable for fast screening and diagnosis of breast cancer. Still, further development of the SMS-ss-DWI-EPI sequence is needed for improved IQ, decreased presence of artifacts and even better lesion conspicuity.
+  
+                                                         Conclusion
+                                                         Despite the perceived poorer image quality and the more disturbing presence of artifacts in the SMS-ss-DWI-EPI sequence, malignant lesions are better visualized using this sequence. When image quality and conspicuity are further improved, this technique might enable improved lesion detection on unenhanced diffusion weighted breast MRI.},
   optnote = {DIAG},
   year = {2020},
 }
@@ -27298,11 +27331,11 @@ @article{Sand20b
   url = {http://dx.doi.org/10.1007/s10549-020-05814-z},
   volume = {184},
   abstract = {Abstract
-                         Purpose
-                         To assess the feasibility of completely excising small breast cancers using the automated, image-guided, single-pass radiofrequency-based breast lesion excision system (BLES) under ultrasound (US) guidance. Methods
-                         From February 2018 to July 2019, 22 patients diagnosed with invasive carcinomas <= 15 mm at US and mammography were enrolled in this prospective, multi-center, ethics board-approved study. Patients underwent breast MRI to verify lesion size. BLES-based excision and surgery were performed during the same procedure. Histopathology findings from the BLES procedure and surgery were compared, and total excision findings were assessed. Results
-                         Of the 22 patients, ten were excluded due to the lesion being &gt; 15 mm and/or being multifocal at MRI, and one due to scheduling issues. The remaining 11 patients underwent BLES excision. Mean diameter of excised lesions at MRI was 11.8 mm (range 8.0-13.9 mm). BLES revealed ten (90.9%) invasive carcinomas of no special type, and one (9.1%) invasive lobular carcinoma. Histopathological results were identical for the needle biopsy, BLES, and surgical specimens for all lesions. None of the BLES excisions were adequate. Margins were usually compromised on both sides of the specimen, indicating that the excised volume was too small. Margin assessment was good for all BLES specimens. One technical complication occurred (retrieval of an empty BLES basket, specimen retrieved during subsequent surgery). Conclusions
-                         BLES allows accurate diagnosis of small invasive breast carcinomas. However, BLES cannot be considered as a therapeutic device for small invasive breast carcinomas due to not achieving adequate excision.},
+                           Purpose
+                           To assess the feasibility of completely excising small breast cancers using the automated, image-guided, single-pass radiofrequency-based breast lesion excision system (BLES) under ultrasound (US) guidance. Methods
+                           From February 2018 to July 2019, 22 patients diagnosed with invasive carcinomas <= 15 mm at US and mammography were enrolled in this prospective, multi-center, ethics board-approved study. Patients underwent breast MRI to verify lesion size. BLES-based excision and surgery were performed during the same procedure. Histopathology findings from the BLES procedure and surgery were compared, and total excision findings were assessed. Results
+                           Of the 22 patients, ten were excluded due to the lesion being > 15 mm and/or being multifocal at MRI, and one due to scheduling issues. The remaining 11 patients underwent BLES excision. Mean diameter of excised lesions at MRI was 11.8 mm (range 8.0-13.9 mm). BLES revealed ten (90.9%) invasive carcinomas of no special type, and one (9.1%) invasive lobular carcinoma. Histopathological results were identical for the needle biopsy, BLES, and surgical specimens for all lesions. None of the BLES excisions were adequate. Margins were usually compromised on both sides of the specimen, indicating that the excised volume was too small. Margin assessment was good for all BLES specimens. One technical complication occurred (retrieval of an empty BLES basket, specimen retrieved during subsequent surgery). Conclusions
+                           BLES allows accurate diagnosis of small invasive breast carcinomas. However, BLES cannot be considered as a therapeutic device for small invasive breast carcinomas due to not achieving adequate excision.},
   all_ss_ids = {[7e887b42074a0fe22a1edb7d438462a299b129fc]},
   automatic = {yes},
   citation-count = {3},
@@ -27955,22 +27988,22 @@ @article{Scha21a
   url = {http://dx.doi.org/10.1007/s00330-021-07798-w},
   volume = {31},
   abstract = {Abstract
-                                         Objectives
-                                         The individual course of disease in idiopathic pulmonary fibrosis (IPF) is highly variable. Assessment of disease activity and prospective estimation of disease progression might have the potential to improve therapy management and indicate the onset of treatment at an earlier stage. The aim of this study was to evaluate whether regional ventilation, lung perfusion, and late enhancement can serve as early imaging markers for disease progression in patients with IPF.
-
-                                         Methods
-                                         In this retrospective study, contrast-enhanced dual-energy CT scans of 32 patients in inspiration and delayed expiration were performed at two time points with a mean interval of 15.4 months. The pulmonary blood volume (PBV) images obtained in the arterial and delayed perfusion phase served as a surrogate for arterial lung perfusion and parenchymal late enhancement. The virtual non-contrast (VNC) images in inspiration and expiration were non-linearly registered to provide regional ventilation images. Image-derived parameters were correlated with longitudinal changes of lung function (FVC%, DLCO%), mean lung density in CT, and CT-derived lung volume.
-
-                                         Results
-                                         Regional ventilation and late enhancement at baseline preceded future change in lung volume (R - 0.474, p 0.006/R - 0.422, p 0.016, respectively) and mean lung density (R - 0.469, p 0.007/R - 0.402, p 0.022, respectively). Regional ventilation also correlated with a future change in FVC% (R - 0.398, p 0.024).
-
-                                         Conclusion
-                                         CT-derived functional parameters of regional ventilation and parenchymal late enhancement are potential early imaging markers for idiopathic pulmonary fibrosis progression.
-
-                                         Key Points
-                                         * Functional CT parameters at baseline (regional ventilation and late enhancement) correlate with future structural changes of the lung as measured with loss of lung volume and increase in lung density in serial CT scans of patients with idiopathic pulmonary fibrosis.
-                                         * Functional CT parameter measurements in high-attenuation areas (- 600 to - 250 HU) are significantly different from normal-attenuation areas (- 950 to - 600 HU) of the lung.
-                                         * Mean regional ventilation in functional CT correlates with a future change in forced vital capacity (FVC) in pulmonary function tests.},
+                                           Objectives
+                                           The individual course of disease in idiopathic pulmonary fibrosis (IPF) is highly variable. Assessment of disease activity and prospective estimation of disease progression might have the potential to improve therapy management and indicate the onset of treatment at an earlier stage. The aim of this study was to evaluate whether regional ventilation, lung perfusion, and late enhancement can serve as early imaging markers for disease progression in patients with IPF.
+  
+                                           Methods
+                                           In this retrospective study, contrast-enhanced dual-energy CT scans of 32 patients in inspiration and delayed expiration were performed at two time points with a mean interval of 15.4 months. The pulmonary blood volume (PBV) images obtained in the arterial and delayed perfusion phase served as a surrogate for arterial lung perfusion and parenchymal late enhancement. The virtual non-contrast (VNC) images in inspiration and expiration were non-linearly registered to provide regional ventilation images. Image-derived parameters were correlated with longitudinal changes of lung function (FVC%, DLCO%), mean lung density in CT, and CT-derived lung volume.
+  
+                                           Results
+                                           Regional ventilation and late enhancement at baseline preceded future change in lung volume (R - 0.474, p 0.006/R - 0.422, p 0.016, respectively) and mean lung density (R - 0.469, p 0.007/R - 0.402, p 0.022, respectively). Regional ventilation also correlated with a future change in FVC% (R - 0.398, p 0.024).
+  
+                                           Conclusion
+                                           CT-derived functional parameters of regional ventilation and parenchymal late enhancement are potential early imaging markers for idiopathic pulmonary fibrosis progression.
+  
+                                           Key Points
+                                           * Functional CT parameters at baseline (regional ventilation and late enhancement) correlate with future structural changes of the lung as measured with loss of lung volume and increase in lung density in serial CT scans of patients with idiopathic pulmonary fibrosis.
+                                           * Functional CT parameter measurements in high-attenuation areas (- 600 to - 250 HU) are significantly different from normal-attenuation areas (- 950 to - 600 HU) of the lung.
+                                           * Mean regional ventilation in functional CT correlates with a future change in forced vital capacity (FVC) in pulmonary function tests.},
   all_ss_ids = {[e3b4c600258e54618755f356e50e9627cca183dc]},
   automatic = {yes},
   citation-count = {12},
@@ -28022,22 +28055,22 @@ @article{Scha22b
   url = {http://dx.doi.org/10.1007/s00330-022-08702-w},
   volume = {32},
   abstract = {Abstract
-                                         Objectives
-                                         Idiopathic pulmonary fibrosis (IPF) is a disease with a poor prognosis and a highly variable course. Pathologically increased ventilation--accessible by functional CT--is discussed as a potential predecessor of lung fibrosis. The purpose of this feasibility study was to investigate whether increased regional ventilation at baseline CT and morphological changes in the follow-up CT suggestive for fibrosis indeed occur in spatial correspondence.
-
-                                         Methods
-                                         In this retrospective study, CT scans were performed at two time points between September 2016 and November 2020. Baseline ventilation was divided into four categories ranging from low, normal to moderately, and severely increased (C1-C4). Correlation between baseline ventilation and volume and density change at follow-up was investigated in corresponding voxels. The significance of the difference of density and volume change per ventilation category was assessed using paired t-tests with a significance level of p <= 0.05. The analysis was performed separately for normal (NAA) and high attenuation areas (HAA).
-
-                                         Results
-                                         The study group consisted of 41 patients (73 +- 10 years, 36 men). In both NAA and HAA, significant increases of density and loss of volume were seen in areas of severely increased ventilation (C4) at baseline compared to areas of normal ventilation (C2, p &lt; 0.001). In HAA, morphological changes were more heterogeneous compared to NAA.
-
-                                         Conclusion
-                                         Functional CT assessing the extent and distribution of lung parenchyma with pathologically increased ventilation may serve as an imaging marker to prospectively identify lung parenchyma at risk for developing fibrosis.
-
-                                         Key Points
-                                         * Voxelwise correlation of serial CT scans suggests spatial correspondence between increased ventilation at baseline and structural changes at follow-up.
-                                         * Regional assessment of pathologically increased ventilation at baseline has the potential to prospectively identify tissue at risk for developing fibrosis.
-                                         * Presence and extent of pathologically increased ventilation may serve as an early imaging marker of disease activity.},
+                                           Objectives
+                                           Idiopathic pulmonary fibrosis (IPF) is a disease with a poor prognosis and a highly variable course. Pathologically increased ventilation--accessible by functional CT--is discussed as a potential predecessor of lung fibrosis. The purpose of this feasibility study was to investigate whether increased regional ventilation at baseline CT and morphological changes in the follow-up CT suggestive for fibrosis indeed occur in spatial correspondence.
+  
+                                           Methods
+                                           In this retrospective study, CT scans were performed at two time points between September 2016 and November 2020. Baseline ventilation was divided into four categories ranging from low, normal to moderately, and severely increased (C1-C4). Correlation between baseline ventilation and volume and density change at follow-up was investigated in corresponding voxels. The significance of the difference of density and volume change per ventilation category was assessed using paired t-tests with a significance level of p <= 0.05. The analysis was performed separately for normal (NAA) and high attenuation areas (HAA).
+  
+                                           Results
+                                           The study group consisted of 41 patients (73 +- 10 years, 36 men). In both NAA and HAA, significant increases of density and loss of volume were seen in areas of severely increased ventilation (C4) at baseline compared to areas of normal ventilation (C2, p < 0.001). In HAA, morphological changes were more heterogeneous compared to NAA.
+  
+                                           Conclusion
+                                           Functional CT assessing the extent and distribution of lung parenchyma with pathologically increased ventilation may serve as an imaging marker to prospectively identify lung parenchyma at risk for developing fibrosis.
+  
+                                           Key Points
+                                           * Voxelwise correlation of serial CT scans suggests spatial correspondence between increased ventilation at baseline and structural changes at follow-up.
+                                           * Regional assessment of pathologically increased ventilation at baseline has the potential to prospectively identify tissue at risk for developing fibrosis.
+                                           * Presence and extent of pathologically increased ventilation may serve as an early imaging marker of disease activity.},
   all_ss_ids = {[89af59fe91bde6b84ea2c7ff7a901108300a55dc]},
   automatic = {yes},
   citation-count = {4},
@@ -28073,7 +28106,7 @@ @inproceedings{Scha23a
   doi = {10.1183/13993003.congress-2023.pa2278},
   year = {2023},
   abstract = {To determine whether alveolar collapse, defined as an abnormal increase in CT attenuation during expiration, can be detected using paired attenuation histograms in inspiration and expiration and used as a potential predictive marker in IPF patients.   Methods Sixty-six individuals with IPF obtained CT scans during inspiration and expiration. Density histograms were created and analyzed. After each respective 3-year observation period, the patient population was split into two subgroups according to their status (endpoints: death/transplantation vs still under clinical surveillance). An independent t-test was used to compare the CT-derived histogram parameters of attenuation between the two subgroups (ratio of mean attenuation in expiration/inspiration (meanHUratio) and SD, skewness and kurtosis in expiration). Results After the individual observation period of 3 years, 37 patients were still under clinical surveillance while 29 had died or received a transplantation. All baseline characteristics (meanHUratio and SD, skewness and kurtosis of expiratory attenuation histograms) were significantly different between the subgroups (p = 0.004, p = 0.009, p < 0.001 and p < 0.001, respectively). ![Figure][1]</img>   Conclusion Expiratory attenuation histogram analysis can be used to demonstrate the concept of alveolar collapse as a potential prognostic marker in IPF patients. Footnotes  Cite this article as: European Respiratory Journal 2023; 62: Suppl. 67, PA2278. This abstract was presented at the 2023 ERS International Congress, in session "Inflammatory endotyping: the macrophage across disease areas". This is an ERS International Congress abstract. No full-text version is available. Further material to accompany this abstract may be available at [www.ers-education.org][2] (ERS member access only).  [1]: pending:yes
-            [2]: http://www.ers-education.org},
+              [2]: http://www.ers-education.org},
   url = {http://dx.doi.org/10.1183/13993003.congress-2023.pa2278},
   file = {Scha23a.pdf:pdf\\Scha23a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -28090,7 +28123,7 @@ @inproceedings{Scha23b
   doi = {10.1183/13993003.congress-2023.oa4849},
   year = {2023},
   abstract = {Purpose: To investigate whether alveolar collapse detected by an extension of Parametric Response Maps (PRM) can be used as a predictive marker in IPF. Methods: A CT scan was performed in inspiration and expiration on 66 IPF patients. PRMs were created as scatterplots of the voxel-wise attenuation values of the paired inspiration and expiration scans. The PRMs were used to calculate lung parenchymal subvolumes as a function of their attenuation changes during inspiration and expiration. Volumes of "collapsed" lung tissue on expiration were calculated as a percentage of Normal Attenuation Areas (NAA) and High Attenuation Areas (HAA) per patient (NAAcollapse/NAA; HAAcollapse/HAA, respectively). After each respective 3-year period of observation, patients were divided into two subgroups based on their status (endpoints: death and transplantation or still under clinical observation). To compare the named CT parameters obtained at baseline, a Mann-Whitney U test was used. Results: At the end of the 3-year individual follow-up, 37 patients were still under clinical surveillance, whereas 29 patients had died or undergone transplantation. NAAcollapse/NAA and HAAcollapse/HAA differed significantly between subgroups (p = 0.001 and p = 0.002, respectively). ![Figure][1]</img>  Conclusion: The PRM technique can be used to demonstrate the concept of alveolar collapse as a prognostic marker in IPF patients. Footnotes  Cite this article as: European Respiratory Journal 2023; 62: Suppl. 67, OA4849. This abstract was presented at the 2023 ERS International Congress, in session "Inflammatory endotyping: the macrophage across disease areas". This is an ERS International Congress abstract. No full-text version is available. Further material to accompany this abstract may be available at [www.ers-education.org][2] (ERS member access only).  [1]: pending:yes
-           [2]: http://www.ers-education.org},
+             [2]: http://www.ers-education.org},
   url = {http://dx.doi.org/10.1183/13993003.congress-2023.oa4849},
   file = {Scha23b.pdf:pdf\\Scha23b.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
@@ -28106,29 +28139,29 @@ @article{Scha24
   doi = {10.1007/s00330-024-10794-5},
   url = {http://dx.doi.org/10.1007/s00330-024-10794-5},
   abstract = {Abstract
-                     Introduction
-                     This study investigates the performance of a commercially available artificial intelligence (AI) system to identify normal chest radiographs and its potential to reduce radiologist workload.
-
-                     Methods
-                     Retrospective analysis included consecutive chest radiographs from two medical centers between Oct 1, 2016 and Oct 14, 2016. Exclusions comprised follow-up exams within the inclusion period, bedside radiographs, incomplete images, imported radiographs, and pediatric radiographs. Three chest radiologists categorized findings into normal, clinically irrelevant, clinically relevant, urgent, and critical. A commercial AI system processed all radiographs, scoring 10 chest abnormalities on a 0-100 confidence scale. AI system performance was evaluated using the area under the ROC curve (AUC), assessing the detection of normal radiographs. Sensitivity was calculated for the default and a conservative operating point. the detection of negative predictive value (NPV) for urgent and critical findings, as well as the potential workload reduction, was calculated.
-
-                     Results
-                     A total of 2603 radiographs were acquired in 2141 unique patients. Post-exclusion, 1670 radiographs were analyzed. Categories included 479 normal, 332 clinically irrelevant, 339 clinically relevant, 501 urgent, and 19 critical findings. The AI system achieved an AUC of 0.92. Sensitivity for normal radiographs was 92% at default and 53% at the conservative operating point. At the conservative operating point, NPV was 98% for urgent and critical findings, and could result in a 15% workload reduction.
-
-                     Conclusion
-                     A commercially available AI system effectively identifies normal chest radiographs and holds the potential to lessen radiologists' workload by omitting half of the normal exams from reporting.
-
-                     Clinical relevance statement
-                     The AI system is able to detect half of all normal chest radiographs at a clinically acceptable operating point, thereby potentially reducing the workload for the radiologists by 15%.
-
-                     Key Points
-                     <jats:list list-type="bullet">
-                       <jats:list-item>
-                         The AI system reached an AUC of 0.92 for the detection of normal chest radiographs.
-                       </jats:list-item>
-                       <jats:list-item>
-                         Fifty-three percent of normal chest radiographs were identified with a NPV of 98% for urgent findings. AI can reduce the workload of chest radiography reporting by 15%.
-                       </jats:list-item>},
+                       Introduction
+                       This study investigates the performance of a commercially available artificial intelligence (AI) system to identify normal chest radiographs and its potential to reduce radiologist workload.
+  
+                       Methods
+                       Retrospective analysis included consecutive chest radiographs from two medical centers between Oct 1, 2016 and Oct 14, 2016. Exclusions comprised follow-up exams within the inclusion period, bedside radiographs, incomplete images, imported radiographs, and pediatric radiographs. Three chest radiologists categorized findings into normal, clinically irrelevant, clinically relevant, urgent, and critical. A commercial AI system processed all radiographs, scoring 10 chest abnormalities on a 0-100 confidence scale. AI system performance was evaluated using the area under the ROC curve (AUC), assessing the detection of normal radiographs. Sensitivity was calculated for the default and a conservative operating point. The detection of negative predictive value (NPV) for urgent and critical findings, as well as the potential workload reduction, was calculated.
+  
+                       Results
+                       A total of 2603 radiographs were acquired in 2141 unique patients. Post-exclusion, 1670 radiographs were analyzed. Categories included 479 normal, 332 clinically irrelevant, 339 clinically relevant, 501 urgent, and 19 critical findings. The AI system achieved an AUC of 0.92. Sensitivity for normal radiographs was 92% at default and 53% at the conservative operating point. At the conservative operating point, NPV was 98% for urgent and critical findings, and could result in a 15% workload reduction.
+  
+                       Conclusion
+                       A commercially available AI system effectively identifies normal chest radiographs and holds the potential to lessen radiologists' workload by omitting half of the normal exams from reporting.
+  
+                       Clinical relevance statement
+                       The AI system is able to detect half of all normal chest radiographs at a clinically acceptable operating point, thereby potentially reducing the workload for the radiologists by 15%.
+  
+                       Key Points
+                       <jats:list list-type="bullet">
+                         <jats:list-item>
+                           The AI system reached an AUC of 0.92 for the detection of normal chest radiographs.
+                         </jats:list-item>
+                         <jats:list-item>
+                           Fifty-three percent of normal chest radiographs were identified with a NPV of 98% for urgent findings. AI can reduce the workload of chest radiography reporting by 15%.
+                         </jats:list-item>},
   all_ss_ids = {['55265aa49f1db2c23456ad8477839dc3d625525c']},
   automatic = {yes},
   citation-count = {0},
@@ -28535,22 +28568,22 @@ @mastersthesis{Schi20
   author = {Martijn Schilpzand},
   title = {Automatic Placenta Localisation from Ultrasound Imaging in a Resource-Limited Setting},
   abstract = {Placenta previa and low-lying placenta are dangerous conditions that can cause severe
-                              maternal and fetal complications. Obstetric ultrasound imaging is commonly used to
-                              detect these maternal risk factors. Unfortunately, low-income countries suffer from a
-                              shortage of trained sonographers to perform ultrasound examinations. To address this
-                              problem, this study presents an algorithm to automatically detect low-lying placenta or
-                              placenta previa from ultrasound data acquired with a standardized acquisition protocol.
-                              This acquisition protocol can be taught to any healthcare worker within two hours. The
-                              detection algorithm was optimized for performance and efficiency so that it can run on
-                              a smartphone in combination with low-cost ultrasound equipment. The dataset used
-                              in this study originates from St. Luke's hospital in Wolisso, Ethiopia and was acquired
-                              with a low-cost ultrasound device. The detection algorithm consisted of two parts. First,
-                              the placenta was segmented by a deep learning model with a U-Net architecture. This
-                              segmentation model achieved a median test Dice of 0.835 on 2D ultrasound images.
-                              Then, the segmentation data was used as input for a binary classifier which classified a
-                              case as either normal placenta or as a class which includes both low-lying placenta and
-                              placenta previa. The classification model achieved a sensitivity of 85% and a specificity
-                              of 86%.},
+                                maternal and fetal complications. Obstetric ultrasound imaging is commonly used to
+                                detect these maternal risk factors. Unfortunately, low-income countries suffer from a
+                                shortage of trained sonographers to perform ultrasound examinations. To address this
+                                problem, this study presents an algorithm to automatically detect low-lying placenta or
+                                placenta previa from ultrasound data acquired with a standardized acquisition protocol.
+                                This acquisition protocol can be taught to any healthcare worker within two hours. The
+                                detection algorithm was optimized for performance and efficiency so that it can run on
+                                a smartphone in combination with low-cost ultrasound equipment. The dataset used
+                                in this study originates from St. Luke's hospital in Wolisso, Ethiopia and was acquired
+                                with a low-cost ultrasound device. The detection algorithm consisted of two parts. First,
+                                the placenta was segmented by a deep learning model with a U-Net architecture. This
+                                segmentation model achieved a median test Dice of 0.835 on 2D ultrasound images.
+                                Then, the segmentation data was used as input for a binary classifier which classified a
+                                case as either normal placenta or as a class which includes both low-lying placenta and
+                                placenta previa. The classification model achieved a sensitivity of 85% and a specificity
+                                of 86%.},
   file = {Schi20.pdf:pdf/Schi20.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University Nijmegen},
@@ -28874,9 +28907,9 @@ @article{Schr18
   pages = {626--633},
   doi = {10.1136/thoraxjnl-2017-211107},
   abstract = {Background: All lung cancer CT screening trials used fixed follow-up intervals, which may not be optimal. We developed new lung cancer risk models for personalising screening intervals to 1 year or 2 years, and compared these with existing models.
-                                                       Methods: We included participants in the CT arm of the National Lung Screening Trial (2002-2010) who underwent a baseline scan and a first annual follow-up scan and were not diagnosed with lung cancer in the first year. True and false positives and the area under the curve of each model were calculated. Internal validation was performed using bootstrapping.
-                                                       Results: Data from 24 542 participants were included in the analysis. The accuracy was 0.785, 0.693, 0.697, 0.666 and 0.727 for the polynomial, patient characteristics, diameter, Patz and PanCan models, respectively. Of the 24 542 participants included, 174 (0.71%) were diagnosed with lung cancer between the first and the second annual follow-ups. Using the polynomial model, 2558 (10.4%, 95% CI 10.0% to 10.8%), 7544 (30.7%, 30.2% to 31.3%), 10 947 (44.6%, 44.0% to 45.2%), 16 710 (68.1%, 67.5% to 68.7%) and 20 023 (81.6%, 81.1% to 92.1%) of the 24 368 participants who did not develop lung cancer in the year following the first follow-up screening round could have safely skipped it, at the expense of delayed diagnosis of 0 (0.0%, 0.0% to 2.7%), 8 (4.6%, 2.2% to 9.2%), 17 (9.8%, 6.0% to 15.4%), 44 (25.3%, 19.2% to 32.5%) and 70 (40.2%, 33.0% to 47.9%) of the 174 lung cancers, respectively.
-                                                       Conclusions: The polynomial model, using both patient characteristics and baseline scan morphology, was significantly superior in assigning participants to 1-year or 2-year screening intervals. Implementing personalised follow-up intervals would enable hundreds of participants to skip a screening round per lung cancer diagnosis delayed.},
+                                                         Methods: We included participants in the CT arm of the National Lung Screening Trial (2002-2010) who underwent a baseline scan and a first annual follow-up scan and were not diagnosed with lung cancer in the first year. True and false positives and the area under the curve of each model were calculated. Internal validation was performed using bootstrapping.
+                                                         Results: Data from 24 542 participants were included in the analysis. The accuracy was 0.785, 0.693, 0.697, 0.666 and 0.727 for the polynomial, patient characteristics, diameter, Patz and PanCan models, respectively. Of the 24 542 participants included, 174 (0.71%) were diagnosed with lung cancer between the first and the second annual follow-ups. Using the polynomial model, 2558 (10.4%, 95% CI 10.0% to 10.8%), 7544 (30.7%, 30.2% to 31.3%), 10 947 (44.6%, 44.0% to 45.2%), 16 710 (68.1%, 67.5% to 68.7%) and 20 023 (81.6%, 81.1% to 92.1%) of the 24 368 participants who did not develop lung cancer in the year following the first follow-up screening round could have safely skipped it, at the expense of delayed diagnosis of 0 (0.0%, 0.0% to 2.7%), 8 (4.6%, 2.2% to 9.2%), 17 (9.8%, 6.0% to 15.4%), 44 (25.3%, 19.2% to 32.5%) and 70 (40.2%, 33.0% to 47.9%) of the 174 lung cancers, respectively.
+                                                         Conclusions: The polynomial model, using both patient characteristics and baseline scan morphology, was significantly superior in assigning participants to 1-year or 2-year screening intervals. Implementing personalised follow-up intervals would enable hundreds of participants to skip a screening round per lung cancer diagnosis delayed.},
   file = {:pdf/Schr18.pdf:PDF;:pdf/Schr18_Appendix.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {29602813},
@@ -29304,10 +29337,10 @@ @article{Sech20
   title = {Artificial Intelligence for Breast Cancer Detection in Mammography: state of the art},
   doi = {10.1016/j.semcancer.2020.06.002},
   abstract = {Screening for breast cancer with mammography has been introduced in various countries over the last 30 years, initially using analog screen-film-based systems and, over the last 20 years, transitioning to the use of fully digital systems. With the introduction of digitization, the computer interpretation of images has been a subject of intense interest, resulting in the introduction of computer-aided detection (CADe) and diagnosis (CADx) algorithms in the early 2000's. Although they were introduced with high expectations, the potential improvement in the clinical realm failed to materialize, mostly due to the high number of false positive marks per analyzed image.
-
-                                                      In the last five years, the artificial intelligence (AI) revolution in computing, driven mostly by deep learning and convolutional neural networks, has also pervaded the field of automated breast cancer detection in digital mammography and digital breast tomosynthesis. Research in this area first involved comparison of its capabilities to that of conventional CADe/CADx methods, which quickly demonstrated the potential of this new technology. In the last couple of years, more mature and some commercial products have been developed, and studies of their performance compared to that of experienced breast radiologists are showing that these algorithms are on par with human-performance levels in retrospective data sets. Although additional studies, especially prospective evaluations performed in the real screening environment, are needed, it is becoming clear that AI will have an important role in the future breast cancer screening realm. Exactly how this new player will shape this field remains to be determined, but recent studies are already evaluating different options for implementation of this technology.
-
-                                                      The aim of this review is to provide an overview of the basic concepts and developments in the field AI for breast cancer detection in digital mammography and digital breast tomosynthesis. The pitfalls of conventional methods, and how these are, for the most part, avoided by this new technology, will be discussed. Importantly, studies that have evaluated the current capabilities of AI and proposals for how these capabilities should be leveraged in the clinical realm will be reviewed, while the questions that need to be answered before this vision becomes a reality are posed.},
+  
+                                                        In the last five years, the artificial intelligence (AI) revolution in computing, driven mostly by deep learning and convolutional neural networks, has also pervaded the field of automated breast cancer detection in digital mammography and digital breast tomosynthesis. Research in this area first involved comparison of its capabilities to that of conventional CADe/CADx methods, which quickly demonstrated the potential of this new technology. In the last couple of years, more mature and some commercial products have been developed, and studies of their performance compared to that of experienced breast radiologists are showing that these algorithms are on par with human-performance levels in retrospective data sets. Although additional studies, especially prospective evaluations performed in the real screening environment, are needed, it is becoming clear that AI will have an important role in the future breast cancer screening realm. Exactly how this new player will shape this field remains to be determined, but recent studies are already evaluating different options for implementation of this technology.
+  
+                                                        The aim of this review is to provide an overview of the basic concepts and developments in the field AI for breast cancer detection in digital mammography and digital breast tomosynthesis. The pitfalls of conventional methods, and how these are, for the most part, avoided by this new technology, will be discussed. Importantly, studies that have evaluated the current capabilities of AI and proposals for how these capabilities should be leveraged in the clinical realm will be reviewed, while the questions that need to be answered before this vision becomes a reality are posed.},
   file = {Sech20.pdf:pdf/Sech20.pdf:PDF},
   journal = {Seminars in Cancer Biology},
   optnote = {DIAG, INPRESS, RADIOLOGY},
@@ -29547,16 +29580,16 @@ @article{Sier20
   volume = 30,
   pages = {3198-3209},
   abstract = {Objectives
-                                                      The diagnostic reading of follow-up low-dose whole-body computed tomography (WBCT) examinations in patients with multiple myeloma (MM) is a demanding process. This study aimed to evaluate the diagnostic accuracy and benefit of a novel software program providing rapid-subtraction maps for bone lesion change detection.
-
-                                                      Methods
-                                                      Sixty patients (66 years +- 10 years) receiving 120 WBCT examinations for follow-up evaluation of MM bone disease were identified from our imaging archive. The median follow-up time was 292 days (range 200-641 days). Subtraction maps were calculated from 2-mm CT images using a nonlinear deformation algorithm. Reading time, correctly assessed lesions, and disease classification were compared to a standard reading software program. De novo clinical reading by a senior radiologist served as the reference standard. Statistics included Wilcoxon rank-sum test, Cohen's kappa coefficient, and calculation of sensitivity, specificity, positive/negative predictive value, and accuracy.
-
-                                                      Results
-                                                      Calculation time for subtraction maps was 84 s +- 24 s. Both readers reported exams faster using subtraction maps (reader A, 438 s +- 133 s; reader B, 1049 s +- 438 s) compared to PACS software (reader A, 534 s +- 156 s; reader B, 1486 s +- 587 s; p < 0.01). The course of disease was correctly classified by both methods in all patients. Sensitivity for lesion detection in subtraction maps/conventional reading was 92%/80% for reader A and 88%/76% for reader B. Specificity was 98%/100% for reader A and 95%/96% for reader B.
-
-                                                      Conclusion
-                                                      A software program for the rapid-subtraction map calculation of follow-up WBCT scans has been successfully tested and seems suited for application in clinical routine. Subtraction maps significantly facilitated reading of WBCTs by reducing reading time and increasing sensitivity.},
+                                                        The diagnostic reading of follow-up low-dose whole-body computed tomography (WBCT) examinations in patients with multiple myeloma (MM) is a demanding process. This study aimed to evaluate the diagnostic accuracy and benefit of a novel software program providing rapid-subtraction maps for bone lesion change detection.
+  
+                                                        Methods
+                                                        Sixty patients (66 years +- 10 years) receiving 120 WBCT examinations for follow-up evaluation of MM bone disease were identified from our imaging archive. The median follow-up time was 292 days (range 200-641 days). Subtraction maps were calculated from 2-mm CT images using a nonlinear deformation algorithm. Reading time, correctly assessed lesions, and disease classification were compared to a standard reading software program. De novo clinical reading by a senior radiologist served as the reference standard. Statistics included Wilcoxon rank-sum test, Cohen's kappa coefficient, and calculation of sensitivity, specificity, positive/negative predictive value, and accuracy.
+  
+                                                        Results
+                                                        Calculation time for subtraction maps was 84 s +- 24 s. Both readers reported exams faster using subtraction maps (reader A, 438 s +- 133 s; reader B, 1049 s +- 438 s) compared to PACS software (reader A, 534 s +- 156 s; reader B, 1486 s +- 587 s; p < 0.01). The course of disease was correctly classified by both methods in all patients. Sensitivity for lesion detection in subtraction maps/conventional reading was 92%/80% for reader A and 88%/76% for reader B. Specificity was 98%/100% for reader A and 95%/96% for reader B.
+  
+                                                        Conclusion
+                                                        A software program for the rapid-subtraction map calculation of follow-up WBCT scans has been successfully tested and seems suited for application in clinical routine. Subtraction maps significantly facilitated reading of WBCTs by reducing reading time and increasing sensitivity.},
   file = {Sier20.pdf:pdf\\Sier20.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {32048038},
@@ -29729,19 +29762,19 @@ @mastersthesis{Sloo20
   author = {Ilse Slootweg},
   title = {Patient variables related to false predictions of deep-learning assisted prostate cancer detection in MRI},
   abstract = {Background:
-                            DL-CAD for prediction of clinically significant prostate cancer (csPCa) in mpMRI is developed to aid radiologists in PI-RADS evaluation. DL-CAD predictions have low accuracy, possibly due to clinical risk factors of csPCa that are not taken into account by DL-CAD.
-
-                            Purpose:
-                            Aim is to identify patient subgroups of clinical characteristics in which DL-CAD predictions differ from radiologists.
-
-                            Methods:
-                            DL-CAD was applied to a test cohort of men examined for PCa according to PI-RADSv2 between 2016 and 2017. Ground truth was provided by manually annotated PI-RADS >=4 lesions. Patient age and PSA were derived from the electronic patient record and other variables were mined from the written radiological reports. False and correct predicted patients were compared on variable distributions and false positive rates were compared between variable categories.
-
-                            Results:
-                            CsPCa was predicted for a total of 482 men (36.9% PIRADS >=4). Benign and malignant patients statistically differed on all clinical variables (P<.05). DL-CAD negative predictive value and positive predictive value were 0.912 and 0.457, respectively. False and correct positive predicted patients significantly differed on age (P<.05), PSA (P<.001), and PSAD (P<.001) as well as prostate volume (P<.001), number of lesions (P<.001), and number of affected zones (P<.001). Analysis of negative predictions was inconclusive due to small population size.
-
-                            Conclusions:
-                            False positive DL-CAD csPCa predictions are due to unavailable clinical variables that are used in radiologists' PI-RADS risk assessment. We advise to study the effect of including age, PSA and PSAD information in DL-CAD input on prediction accuracy.},
+                              DL-CAD for prediction of clinically significant prostate cancer (csPCa) in mpMRI is developed to aid radiologists in PI-RADS evaluation. DL-CAD predictions have low accuracy, possibly due to clinical risk factors of csPCa that are not taken into account by DL-CAD.
+  
+                              Purpose:
+                              Aim is to identify patient subgroups of clinical characteristics in which DL-CAD predictions differ from radiologists.
+  
+                              Methods:
+                              DL-CAD was applied to a test cohort of men examined for PCa according to PI-RADSv2 between 2016 and 2017. Ground truth was provided by manually annotated PI-RADS >=4 lesions. Patient age and PSA were derived from the electronic patient record and other variables were mined from the written radiological reports. False and correct predicted patients were compared on variable distributions and false positive rates were compared between variable categories.
+  
+                              Results:
+                              CsPCa was predicted for a total of 482 men (36.9% PIRADS >=4). Benign and malignant patients statistically differed on all clinical variables (P<.05). DL-CAD negative predictive value and positive predictive value were 0.912 and 0.457, respectively. False and correct positive predicted patients significantly differed on age (P<.05), PSA (P<.001), and PSAD (P<.001) as well as prostate volume (P<.001), number of lesions (P<.001), and number of affected zones (P<.001). Analysis of negative predictions was inconclusive due to small population size.
+  
+                              Conclusions:
+                              False positive DL-CAD csPCa predictions are due to unavailable clinical variables that are used in radiologists' PI-RADS risk assessment. We advise to study the effect of including age, PSA and PSAD information in DL-CAD input on prediction accuracy.},
   file = {:pdf/Sloo20.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University Medical Center},
@@ -29893,14 +29926,14 @@ @article{Slui23
   url = {http://dx.doi.org/10.1371/journal.pone.0290118},
   volume = {18},
   abstract = {<jats:sec id="sec001">
-                         Background
-                         Ethnicity impacts cardiovascular disease (CVD) risk, and South Asians demonstrate a higher risk than White Europeans. Arterial stiffness is known to contribute to CVD, and differences in arterial stiffness between ethnicities could explain the disparity in CVD risk. We compared central and local arterial stiffness between White Europeans and South Asians and investigated which factors are associated with arterial stiffness. <jats:sec id="sec002">
-                         Methods
-                         Data were collected from cohorts of White Europeans (the Netherlands) and South Asians (India). We matched cohorts on individual level using age, sex, and body mass index (BMI). Arterial stiffness was measured with ARTSENS(r) Plus. Central stiffness was expressed as carotid-femoral pulse wave velocity (cf-PWV, m/s), and local carotid stiffness was quantified using the carotid stiffness index (Beta) and pressure-strain elastic modulus (Epsilon, kPa). We compared arterial stiffness between cohorts and used multivariable linear regression to identify factors related to stiffness. <jats:sec id="sec003">
-                         Results
-                         We included n = 121 participants per cohort (age 53+-10 years, 55% male, BMI 24 kg/m2). Cf-PWV was lower in White Europeans compared to South Asians (6.8+-1.9 vs. 8.2+-1.8 m/s, p&lt;0.001), but no differences were found for local stiffness parameters Beta (5.4+-2.4 vs. 5.8+-2.3, p = 0.17) and Epsilon (72+-35 vs. 70+-31 kPa, p = 0.56). Age (standardized b, 95% confidence interval: 0.28, 0.17-0.39), systolic blood pressure (0.32, 0.21-0.43), and South Asian ethnicity (0.46, 0.35-0.57) were associated with cf-PWV; associations were similar between cohorts (p&gt;0.05 for interaction). Systolic blood pressure was associated with carotid stiffness in both cohorts, whereas age was associated to carotid stiffness only in South Asians and BMI only in White Europeans. <jats:sec id="sec004">
-                         Conclusion
-                         Ethnicity is associated with central but not local arterial stiffness. Conversely, ethnicity seems to modify associations between CVD risk factors and local but not central arterial stiffness. This suggests that ethnicity interacts with arterial stiffness measures and the association of these measures with CVD risk factors.},
+                           Background
+                           Ethnicity impacts cardiovascular disease (CVD) risk, and South Asians demonstrate a higher risk than White Europeans. Arterial stiffness is known to contribute to CVD, and differences in arterial stiffness between ethnicities could explain the disparity in CVD risk. We compared central and local arterial stiffness between White Europeans and South Asians and investigated which factors are associated with arterial stiffness.
+                           Methods
+                           Data were collected from cohorts of White Europeans (the Netherlands) and South Asians (India). We matched cohorts on individual level using age, sex, and body mass index (BMI). Arterial stiffness was measured with ARTSENS(r) Plus. Central stiffness was expressed as carotid-femoral pulse wave velocity (cf-PWV, m/s), and local carotid stiffness was quantified using the carotid stiffness index (Beta) and pressure-strain elastic modulus (Epsilon, kPa). We compared arterial stiffness between cohorts and used multivariable linear regression to identify factors related to stiffness.
+                           Results
+                           We included n = 121 participants per cohort (age 53+-10 years, 55% male, BMI 24 kg/m2). Cf-PWV was lower in White Europeans compared to South Asians (6.8+-1.9 vs. 8.2+-1.8 m/s, p<0.001), but no differences were found for local stiffness parameters Beta (5.4+-2.4 vs. 5.8+-2.3, p = 0.17) and Epsilon (72+-35 vs. 70+-31 kPa, p = 0.56). Age (standardized b, 95% confidence interval: 0.28, 0.17-0.39), systolic blood pressure (0.32, 0.21-0.43), and South Asian ethnicity (0.46, 0.35-0.57) were associated with cf-PWV; associations were similar between cohorts (p>0.05 for interaction). Systolic blood pressure was associated with carotid stiffness in both cohorts, whereas age was associated to carotid stiffness only in South Asians and BMI only in White Europeans.
+                           Conclusion
+                           Ethnicity is associated with central but not local arterial stiffness. Conversely, ethnicity seems to modify associations between CVD risk factors and local but not central arterial stiffness. This suggests that ethnicity interacts with arterial stiffness measures and the association of these measures with CVD risk factors.},
   all_ss_ids = {[5ef110f24bcba9e18be64a1b9744e5a10b0827e5]},
   automatic = {yes},
   citation-count = {0},
@@ -29917,32 +29950,32 @@ @conference{Smee18
   booktitle = {European Society for Molecular Imaging},
   title = {Tumor heterogeneity as a PET-biomarker predicts overall survival of pancreatic cancer patients},
   abstract = {INTRODUCTION
-                                                       Pancreatic ductal adenocarcinoma (PDAC) shows a 5-year survival rate of 8%[1]. This mortality results from a lack of methods to accurately treat patients[2]. PDAC is remarkable for its fibrotic reaction, which is present at early stages of PDAC development[3]. Components of this environment can be measured on clinical images[4]. PET derived parameters, e.g. SUVmax, have not been able to provide prognostic information. In this study we developed an algorithm based on FDG-PET texture features (TF) that classifies heterogeneous or homogeneous tumors and shows a correlation with overall survival.
-
-
-                                                       METHODS
-                                                       In total, 121 patients with histologically proven PDAC who underwent 18F-FDG PET/CT (Siemens Biograph mCT, Knoxville, US) were selected from the hospital system. Eighty-six EANM reconstructed scans were visually labeled as 'homogenous' or 'heterogeneous' by experienced Nuclear Medicine physicians and served as training set to develop the classifier [5]. All the 121 scans were used as validation set for the correlation with overall survival (OS). Tumors were delineated using 40% threshold of the SUVmax with manual correction. TF were extracted using the PyRadiomcis toolbox [6]. TF were selected and tested for robustness as described in literature [7-9]. The classifier was build using logistic regression. Prognostic impact was assessed by Kaplan Meier survival analysis and log-rank test.
-
-
-                                                       RESULTS
-                                                       Optimal performance of the leave-one-out cross-validation classifier in the training set yielded an accuracy of 0.73 and AUC of 0.71 in classifying PDAC as heterogeneous or homogeneous tumors. Of note, two tumors were visually labeled as homogenous but correctly classifier as heterogeneous by the classifier after review. For the 121 patients the OS of PDAC tumors classified as heterogeneous, was significantly worse than for homogeneous tumors; median OS 69 weeks (95%CI 64 to 91 weeks) versus median 95 weeks (95%CI 76 to 114), p= 0.0285). This is in contrast with single standard PET parameters, single TF or manual labeling, which had no significant prognostic impact.
-
-
-                                                       CONCLUSIONS
-                                                       We developed an algorithm that accurately classifies PDAC as heterogeneous or homogeneous, based on a set of 18F-FDG PET derived texture features. We showed that the classification result has prognostic value, improving upon standard PET derived parameters and single texture-features. Further validation of this algorithm in an external cohort of PDAC patients is ongoing.
-
-
-                                                       REFERENCES
-
-                                                       [1] Siegel, R.L., K.D. Miller, and A. Jemal, Cancer statistics, 2016. CA Cancer J Clin, 2016. 66(1): p. 7-30.
-                                                       [2] Ryan, D.P., T.S. Hong, and N. Bardeesy, Pancreatic adenocarcinoma. N Engl J Med, 2014. 371(11): p. 1039-49.
-                                                       [3] Neesse, A., et al., Stromal biology and therapy in pancreatic cancer: a changing paradigm. Gut, 2015. 64(9): p. 1476-84.
-                                                       [4] Heid, I., et al., Co-clinical Assessment of Tumor Cellularity in Pancreatic Cancer. Clin Cancer Res, 2017. 23(6): p. 1461-1470.
-                                                       [5] Boellaard, R., et al., FDG PET and PET/CT: EANM procedure guidelines for tumour PET imaging: version 1.0. Eur J Nucl Med Mol Imaging, 2010. 37(1): p. 181-200.
-                                                       [6] van Griethuysen, J.J.M., et al., Computational Radiomics System to Decode the Radiographic Phenotype. Cancer Res, 2017. 77(21): p. e104-e107.
-                                                       [7] Yan, J., et al., Impact of Image Reconstruction Settings on Texture Features in 18F-FDG PET. J Nucl Med, 2015. 56(11): p. 1667-73.
-                                                       [8] Leijenaar, R.T., et al., The effect of SUV discretization in quantitative FDG-PET Radiomics: the need for standardized methodology in tumor texture analysis. Sci Rep, 2015. 5: p. 11075.
-                                                       [9] Grootjans, W., et al., The Impact of Optimal Respiratory Gating and Image Noise on Evaluation of Intratumor Heterogeneity on 18F-FDG PET Imaging of Lung Cancer. J Nucl Med, 2016. 57(11): p. 1692-1698.},
+                                                         Pancreatic ductal adenocarcinoma (PDAC) shows a 5-year survival rate of 8%[1]. This mortality results from a lack of methods to accurately treat patients[2]. PDAC is remarkable for its fibrotic reaction, which is present at early stages of PDAC development[3]. Components of this environment can be measured on clinical images[4]. PET derived parameters, e.g. SUVmax, have not been able to provide prognostic information. In this study we developed an algorithm based on FDG-PET texture features (TF) that classifies heterogeneous or homogeneous tumors and shows a correlation with overall survival.
+  
+  
+                                                         METHODS
+                                                         In total, 121 patients with histologically proven PDAC who underwent 18F-FDG PET/CT (Siemens Biograph mCT, Knoxville, US) were selected from the hospital system. Eighty-six EANM reconstructed scans were visually labeled as 'homogenous' or 'heterogeneous' by experienced Nuclear Medicine physicians and served as training set to develop the classifier [5]. All the 121 scans were used as validation set for the correlation with overall survival (OS). Tumors were delineated using 40% threshold of the SUVmax with manual correction. TF were extracted using the PyRadiomics toolbox [6]. TF were selected and tested for robustness as described in literature [7-9]. The classifier was built using logistic regression. Prognostic impact was assessed by Kaplan Meier survival analysis and log-rank test.
+  
+  
+                                                         RESULTS
+                                                         Optimal performance of the leave-one-out cross-validation classifier in the training set yielded an accuracy of 0.73 and AUC of 0.71 in classifying PDAC as heterogeneous or homogeneous tumors. Of note, two tumors were visually labeled as homogenous but correctly classified as heterogeneous by the classifier after review. For the 121 patients the OS of PDAC tumors classified as heterogeneous, was significantly worse than for homogeneous tumors; median OS 69 weeks (95%CI 64 to 91 weeks) versus median 95 weeks (95%CI 76 to 114), p= 0.0285. This is in contrast with single standard PET parameters, single TF or manual labeling, which had no significant prognostic impact.
+  
+  
+                                                         CONCLUSIONS
+                                                         We developed an algorithm that accurately classifies PDAC as heterogeneous or homogeneous, based on a set of 18F-FDG PET derived texture features. We showed that the classification result has prognostic value, improving upon standard PET derived parameters and single texture-features. Further validation of this algorithm in an external cohort of PDAC patients is ongoing.
+  
+  
+                                                         REFERENCES
+  
+                                                         [1] Siegel, R.L., K.D. Miller, and A. Jemal, Cancer statistics, 2016. CA Cancer J Clin, 2016. 66(1): p. 7-30.
+                                                         [2] Ryan, D.P., T.S. Hong, and N. Bardeesy, Pancreatic adenocarcinoma. N Engl J Med, 2014. 371(11): p. 1039-49.
+                                                         [3] Neesse, A., et al., Stromal biology and therapy in pancreatic cancer: a changing paradigm. Gut, 2015. 64(9): p. 1476-84.
+                                                         [4] Heid, I., et al., Co-clinical Assessment of Tumor Cellularity in Pancreatic Cancer. Clin Cancer Res, 2017. 23(6): p. 1461-1470.
+                                                         [5] Boellaard, R., et al., FDG PET and PET/CT: EANM procedure guidelines for tumour PET imaging: version 1.0. Eur J Nucl Med Mol Imaging, 2010. 37(1): p. 181-200.
+                                                         [6] van Griethuysen, J.J.M., et al., Computational Radiomics System to Decode the Radiographic Phenotype. Cancer Res, 2017. 77(21): p. e104-e107.
+                                                         [7] Yan, J., et al., Impact of Image Reconstruction Settings on Texture Features in 18F-FDG PET. J Nucl Med, 2015. 56(11): p. 1667-73.
+                                                         [8] Leijenaar, R.T., et al., The effect of SUV discretization in quantitative FDG-PET Radiomics: the need for standardized methodology in tumor texture analysis. Sci Rep, 2015. 5: p. 11075.
+                                                         [9] Grootjans, W., et al., The Impact of Optimal Respiratory Gating and Image Noise on Evaluation of Intratumor Heterogeneity on 18F-FDG PET Imaging of Lung Cancer. J Nucl Med, 2016. 57(11): p. 1692-1698.},
   optnote = {DIAG, RADIOLOGY},
   year = {2018},
 }
@@ -30082,16 +30115,16 @@ @article{Smit23
   year = {2023},
   doi = {https://doi.org/10.1016/j.jpi.2023.100191},
   abstract = {Background
-                          The amount of stroma within the primary tumor is a prognostic parameter for colon cancer patients. This phenomenon can be assessed using the tumor-stroma ratio (TSR), which classifies tumors in stroma-low (<=50% stroma) and stroma-high (>50% stroma). Although the reproducibility for TSR determination is good, improvement might be expected from automation. The aim of this study was to investigate whether the scoring of the TSR in a semi- and fully automated method using deep learning algorithms is feasible.
-
-                          Methods
-                          A series of 75 colon cancer slides were selected from a trial series of the UNITED study. For the standard determination of the TSR, 3 observers scored the histological slides. Next, the slides were digitized, color normalized, and the stroma percentages were scored using semi- and fully automated deep learning algorithms. Correlations were determined using intraclass correlation coefficients (ICCs) and Spearman rank correlations.
-
-                          Results
-                          37 (49%) cases were classified as stroma-low and 38 (51%) as stroma-high by visual estimation. A high level of concordance between the 3 observers was reached, with ICCs of 0.91, 0.89, and 0.94 (all P<.001). Between visual and semi-automated assessment the ICC was 0.78 (95% CI 0.23-0.91, P-value 0.005), with a Spearman correlation of 0.88 (P<.001). Spearman correlation coefficients above 0.70 (N=3) were observed for visual estimation versus the fully automated scoring procedures.
-
-                          Conclusion
-                          Good correlations were observed between standard visual TSR determination and semi- and fully automated TSR scores. At this point, visual examination has the highest observer agreement, but semi-automated scoring could be helpful to support pathologists.},
+                            The amount of stroma within the primary tumor is a prognostic parameter for colon cancer patients. This phenomenon can be assessed using the tumor-stroma ratio (TSR), which classifies tumors in stroma-low (<=50% stroma) and stroma-high (>50% stroma). Although the reproducibility for TSR determination is good, improvement might be expected from automation. The aim of this study was to investigate whether the scoring of the TSR in a semi- and fully automated method using deep learning algorithms is feasible.
+  
+                            Methods
+                            A series of 75 colon cancer slides were selected from a trial series of the UNITED study. For the standard determination of the TSR, 3 observers scored the histological slides. Next, the slides were digitized, color normalized, and the stroma percentages were scored using semi- and fully automated deep learning algorithms. Correlations were determined using intraclass correlation coefficients (ICCs) and Spearman rank correlations.
+  
+                            Results
+                            37 (49%) cases were classified as stroma-low and 38 (51%) as stroma-high by visual estimation. A high level of concordance between the 3 observers was reached, with ICCs of 0.91, 0.89, and 0.94 (all P<.001). Between visual and semi-automated assessment the ICC was 0.78 (95% CI 0.23-0.91, P-value 0.005), with a Spearman correlation of 0.88 (P<.001). Spearman correlation coefficients above 0.70 (N=3) were observed for visual estimation versus the fully automated scoring procedures.
+  
+                            Conclusion
+                            Good correlations were observed between standard visual TSR determination and semi- and fully automated TSR scores. At this point, visual examination has the highest observer agreement, but semi-automated scoring could be helpful to support pathologists.},
   file = {Smit23.pdf:pdf\\Smit23.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   ss_id = {935f46ae7c1a5be1ed7a5e176db38fb919bf30df},
@@ -30252,10 +30285,10 @@ @article{Soga22
   pages = {4466--4477},
   volume = {49},
   abstract = {BACKGROUND: Total lung volume is an important quantitative biomarker and is used for the assessment of restrictive lung diseases.
-                             PURPOSE: In this study, we investigate the performance of several deep-learning approaches for automated measurement of total lung volume from chest radiographs.
-                             METHODS: About 7621 posteroanterior and lateral view chest radiographs (CXR) were collected from patients with chest CT available. Similarly, 928 CXR studies were chosen from patients with pulmonary function test (PFT) results. The reference total lung volume was calculated from lung segmentation on CT or PFT data, respectively. This dataset was used to train deep-learning architectures to predict total lung volume from chest radiographs. The experiments were constructed in a stepwise fashion with increasing complexity to demonstrate the effect of training with CT-derived labels only and the sources of error. The optimal models were tested on 291 CXR studies with reference lung volume obtained from PFT. Mean absolute error (MAE), mean absolute percentage error (MAPE), and Pearson correlation coefficient (Pearson's r) were computed.
-                             RESULTS: The optimal deep-learning regression model showed an MAE of 408 ml and an MAPE of 8.1\% using both frontal and lateral chest radiographs as input. The predictions were highly correlated with the reference standard (Pearson's r = 0.92). CT-derived labels were useful for pretraining but the optimal performance was obtained by fine-tuning the network with PFT-derived labels.
-                             CONCLUSION: We demonstrate, for the first time, that state-of-the-art deep-learning solutions can accurately measure total lung volume from plain chest radiographs. The proposed model is made publicly available and can be used to obtain total lung volume from routinely acquired chest radiographs at no additional cost. This deep-learning system can be a useful tool to identify trends over time in patients referred regularly for chest X-ray.},
+                               PURPOSE: In this study, we investigate the performance of several deep-learning approaches for automated measurement of total lung volume from chest radiographs.
+                               METHODS: About 7621 posteroanterior and lateral view chest radiographs (CXR) were collected from patients with chest CT available. Similarly, 928 CXR studies were chosen from patients with pulmonary function test (PFT) results. The reference total lung volume was calculated from lung segmentation on CT or PFT data, respectively. This dataset was used to train deep-learning architectures to predict total lung volume from chest radiographs. The experiments were constructed in a stepwise fashion with increasing complexity to demonstrate the effect of training with CT-derived labels only and the sources of error. The optimal models were tested on 291 CXR studies with reference lung volume obtained from PFT. Mean absolute error (MAE), mean absolute percentage error (MAPE), and Pearson correlation coefficient (Pearson's r) were computed.
+                               RESULTS: The optimal deep-learning regression model showed an MAE of 408 ml and an MAPE of 8.1\% using both frontal and lateral chest radiographs as input. The predictions were highly correlated with the reference standard (Pearson's r = 0.92). CT-derived labels were useful for pretraining but the optimal performance was obtained by fine-tuning the network with PFT-derived labels.
+                               CONCLUSION: We demonstrate, for the first time, that state-of-the-art deep-learning solutions can accurately measure total lung volume from plain chest radiographs. The proposed model is made publicly available and can be used to obtain total lung volume from routinely acquired chest radiographs at no additional cost. This deep-learning system can be a useful tool to identify trends over time in patients referred regularly for chest X-ray.},
   file = {PubMed entry:http\://www.ncbi.nlm.nih.gov/pubmed/35388486:text/html},
   pmid = {35388486},
   ss_id = {1d5f65d8f721089fca1e7fac8d1ef214f12e2c23},
@@ -30337,7 +30370,7 @@ @mastersthesis{Sons19
   author = {Patrick Sonsma},
   title = {Lymphocyte detection in hematoxylin-eosin stained histopathological images of breast cancer},
   abstract = {Lymphocytes are immune cells that form an important bio-marker in the prognosis of breast cancer. In some cases more effective treatment can be chosen based on the lymphocyte presence near tumor regions. For trained pathologists the detection of lymphocytes in Hematoxylin-Eosin stained images is however a challenging and time intensive task with subjective interpretations. In this research we explore the lymphocyte detection problem with a deep learning approach and strive towards a robust, objective and efficient tool for computer aided diagnosis.
-                             We generate a large data-set with machine produced labels by applying an existing model on destained and restained immunohistochemical histopathological images. On this data we train and evaluate a more minimal rendition of the known YOLO object detection model and report moderate results.},
+                               We generate a large data-set with machine produced labels by applying an existing model on destained and restained immunohistochemical histopathological images. On this data we train and evaluate a more minimal rendition of the known YOLO object detection model and report moderate results.},
   file = {Sons19.pdf:pdf\\Sons19.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University},
@@ -30387,16 +30420,16 @@ @conference{Spro22
   booktitle = {Immuno-Oncology and Technology},
   year = {2022},
   abstract = {Background
-                          Immunotherapy has become the standard of care for metastatic non-small cell lung cancer (mNSCLC) without a targetable driver alteration, yet we still lack insight into which patients (pts) will benefit from such treatments. To that end, we investigated characteristics of the immune infiltrate in the tumor microenvironment in relation to immunotherapy response. We report the results of an automated deep learning approach applied to digital H&E whole slide images (WSIs) of pre-treatment biopsies from the PEMBRO-RT clinical trial.
-
-                          Methods
-                          61 quality-checked H&E WSIs were processed with 3 deep learning algorithms. We extracted a tissue mask using an existing method (Bandi et al., 2019), and detected tumor and immune cells using HoVerNet (Graham et al., 2019). Tumor clusters were identified by combining the output of HoVerNet and tumor segmentation from an nnUnet (Isensee et al., 2021) model that we trained on external NSCLC images. From the output of this pipeline, we extracted immune infiltrate-based density metrics, calculated over all tissue (allINF), stroma within 500um from the tumor border (sINF), tumor region (tINF), and the combination of stroma and tumor (t+sINF). All metrics were used in ROC analysis after dichotomizing pts as responders and non-responders (response was defined as complete or partial response at any time point or stable disease for >=12 weeks according to RECIST 1.1 measurement). Differences in metric distributions between the two groups were tested with a two-sided Welch t-test. Kaplan-Meier (KM) analysis was performed on progression-free survival (5-year follow-up).
-
-                          Results
-                          Our automated analysis reported denser immune infiltrates in responders, although not statistically significant (0.05<p<=0.2). All immune infiltrate metrics showed some predictive value with AUCs > 0.63, where tINF reported an AUC of 0.70. KM analysis showed p=0.07 if pts were stratified based on the median tINF, and p=0.02 if stratified based on the optimal operating point of its ROC curve.
-
-                          Conclusions
-                          Deep learning models that analyze the immune infiltrate density on H&E WSIs can identify mNSCLC responders to pembrolizumab.},
+                            Immunotherapy has become the standard of care for metastatic non-small cell lung cancer (mNSCLC) without a targetable driver alteration, yet we still lack insight into which patients (pts) will benefit from such treatments. To that end, we investigated characteristics of the immune infiltrate in the tumor microenvironment in relation to immunotherapy response. We report the results of an automated deep learning approach applied to digital H&E whole slide images (WSIs) of pre-treatment biopsies from the PEMBRO-RT clinical trial.
+  
+                            Methods
+                            61 quality-checked H&E WSIs were processed with 3 deep learning algorithms. We extracted a tissue mask using an existing method (Bandi et al., 2019), and detected tumor and immune cells using HoVerNet (Graham et al., 2019). Tumor clusters were identified by combining the output of HoVerNet and tumor segmentation from an nnUnet (Isensee et al., 2021) model that we trained on external NSCLC images. From the output of this pipeline, we extracted immune infiltrate-based density metrics, calculated over all tissue (allINF), stroma within 500um from the tumor border (sINF), tumor region (tINF), and the combination of stroma and tumor (t+sINF). All metrics were used in ROC analysis after dichotomizing pts as responders and non-responders (response was defined as complete or partial response at any time point or stable disease for >=12 weeks according to RECIST 1.1 measurement). Differences in metric distributions between the two groups were tested with a two-sided Welch t-test. Kaplan-Meier (KM) analysis was performed on progression-free survival (5-year follow-up).
+  
+                            Results
+                            Our automated analysis reported denser immune infiltrates in responders, although not statistically significant (0.05<p<=0.2). All immune infiltrate metrics showed some predictive value with AUCs > 0.63, where tINF reported an AUC of 0.70. KM analysis showed p=0.07 if pts were stratified based on the median tINF, and p=0.02 if stratified based on the optimal operating point of its ROC curve.
+  
+                            Conclusions
+                            Deep learning models that analyze the immune infiltrate density on H&E WSIs can identify mNSCLC responders to pembrolizumab.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -30706,8 +30739,8 @@ @mastersthesis{Stoe19
   author = {Emiel Stoelinga},
   title = {Extracting biomarkers from hematoxylin-eosin stained histopathological images of lung cancer},
   abstract = {In this thesis the technique of deep learning was applied to the field of digital pathology, more specifically lung cancer, to extract several different biomarkers. Tertairy lymphoid structures (TLS) have been found to indicate a positive patient prognosis, especially in combination with germinal centers (GC). Therefore, a VGG16-like network was trained to detect TLS and GC in histopathological slides of lung squamous cell carcinoma with F1 scores on the pixel level of 0.922 and 0.802 respectively. Performance on a different held-out test set on the object level was 0.640 and 0.500 for TLS and GC respectively.
-                             Treatment differs per growth pattern of lung adenocarcinoma and variability between pathol ogists in the assessment of lung adenocarcinoma exists. Therefore, a similar VGG16-like network was trained to segment growth patterns of adenocarcinoma in slides of lung tissue with F1 scores on the pixel level of 0.891, 0.524, 0.812 and 0.954 for solid adenocarcinoma, acinar adenocarcinoma, micropapillary adenocarcinoma and non-tumor tissue respectively.
-                             Because the previous system was only trained on sparsely annotated data and consequently did not encounter neighbouring growth patterns of lung adenocarcinoma, a method with genera tive adversarial networks to generate fake densely annotated realistic looking image patches from sparsely annotated data was examined and a comparison between three types of models was made.},
+                               Treatment differs per growth pattern of lung adenocarcinoma and variability between pathologists in the assessment of lung adenocarcinoma exists. Therefore, a similar VGG16-like network was trained to segment growth patterns of adenocarcinoma in slides of lung tissue with F1 scores on the pixel level of 0.891, 0.524, 0.812 and 0.954 for solid adenocarcinoma, acinar adenocarcinoma, micropapillary adenocarcinoma and non-tumor tissue respectively.
+                               Because the previous system was only trained on sparsely annotated data and consequently did not encounter neighbouring growth patterns of lung adenocarcinoma, a method with generative adversarial networks to generate fake densely annotated realistic looking image patches from sparsely annotated data was examined and a comparison between three types of models was made.},
   file = {Stoe19.pdf:pdf\\Stoe19.pdf:PDF},
   optnote = {DIAG},
   school = {Radboud University},
@@ -30721,37 +30754,37 @@ @conference{Stoi17a
   booktitle = {European Society for Magnetic Resonance in Medicine and Biology},
   year = {2017},
   abstract = {Purpose / Introduction
-                                                       Prostate MRI finds 18% more clinically significant prostate cancer while avoiding 27% biopsies [1]. Reproducibility for multi-parametric T2+DWI+DCE
-                                                       prostate MRI mpMRI is moderate [2] even though a PIRADS reading standard is available [3]. Quantification could help improve reproducibility, which to
-                                                       some extent works for ADC. Scanner provided T2 maps are no solution as it leads to a different texture, lower spatial resolution and increased scan time.
-                                                       We have previously developed a method for normalizing T2-weighted images [4]. The normalized value achieved a diagnostic accuracy AUC of 0.85
-                                                       over 0.64 for the raw T2-weighted values. That method required a separate proton density weighted sequence, an exact knowledge of the sequence
-                                                       model and one reference tissue region. We propose a new method using multiple reference tissues that does not require an additional sequence, nor
-                                                       detailed knowledge about the sequence model. The recent development of deep learning makes it feasible to segment multiple reference tissues. The
-                                                       hypothesis is that the reference tissues allow building a patient specific model to normalize the T2-weighted prostate MR images for quantitative use.
-                                                       Subjects and Methods
-                                                       To test the hypothesis we manually delineated reference tissues and tumor lesions in mpMRI studies of prostate cancer patients. All lesions were
-                                                       interpreted by expert radiologists and assigned a PIRADS score. The normalized T2 was then validated for its ability to discriminate PIRADS 2-3 from 4-5
-                                                       classes. Regions of interest ROI were drawn in four distinct tissue types in fifty T2-weighted images from regular multiparametric prostate MRI mpMRI.
-                                                       The four reference tissue types were: obturator internus muscle, body fat, femoral head, bladder lumen. Four average ROI signals were computed per
-                                                       patient. Each reference tissue was assigned a fixed reference value T2 relaxation found in literature. Per patient, a smooth spline model was fitted to the
-                                                       average, reference pairs. The estimated spline model was then inverted to map patients' raw T2-weighted image scalar values to normalized values. The
-                                                       effect of the normalization was determined by computing and comparing the diagnostic accuracy using ROC analysis.
-                                                       Results
-                                                       The area under the ROC AUC was significantly higher p<0.05 in normalized T2.5/22/2017 #542: Feasibility of multireference tissue normalization of T2weighted prostate MRI.
-
-                                                       Discussion / Conclusion
-                                                       The significant improvement of the diagnostic accuracy demonstrates the potential of our normalization method for the quantitative interpretation of T2-weighted prostate MRI. The results were similar to our previous method.The method still requires manual delineation of multiple reference tissues,
-                                                       however, we will develop deep learning segmentation methods to automate the method and enable regular clinical use.
-                                                       References
-                                                       1. Ahmed, Hashim U., et al. "Diagnostic accuracy of multi?parametric MRI and TRUS biopsy in prostate cancer PROMIS: a paired validating confirmatory
-                                                       study." The Lancet 389.10071 2017: 815?822.
-                                                       2. Rosenkrantz, Andrew B., et al. "Interobserver reproducibility of the PI?RADS version 2 lexicon: a multicenter study of six experienced prostate
-                                                       radiologists." Radiology 280.3 2016: 793?804.
-                                                       3. Barentsz JO, et al. Synopsis of the PI?RADS v2 Guidelines for Multiparametric Prostate Magnetic Resonance Imaging and Recommendations for Use.
-                                                       Eur. Urol. 2016;691:41-49.
-                                                       4. Vos, Pieter C., et al. "Computer?assisted analysis of peripheral zone prostate lesions using T2?weighted and dynamic contrast?enhanced T1?weighted
-                                                       MRI." Physics Med. & Biol. 55.6 2010: 1719},
+                                                         Prostate MRI finds 18% more clinically significant prostate cancer while avoiding 27% biopsies [1]. Reproducibility for multi-parametric T2+DWI+DCE
+                                                         prostate MRI mpMRI is moderate [2] even though a PIRADS reading standard is available [3]. Quantification could help improve reproducibility, which to
+                                                         some extent works for ADC. Scanner provided T2 maps are no solution as it leads to a different texture, lower spatial resolution and increased scan time.
+                                                         We have previously developed a method for normalizing T2-weighted images [4]. The normalized value achieved a diagnostic accuracy AUC of 0.85
+                                                         over 0.64 for the raw T2-weighted values. That method required a separate proton density weighted sequence, an exact knowledge of the sequence
+                                                         model and one reference tissue region. We propose a new method using multiple reference tissues that does not require an additional sequence, nor
+                                                         detailed knowledge about the sequence model. The recent development of deep learning makes it feasible to segment multiple reference tissues. The
+                                                         hypothesis is that the reference tissues allow building a patient specific model to normalize the T2-weighted prostate MR images for quantitative use.
+                                                         Subjects and Methods
+                                                         To test the hypothesis we manually delineated reference tissues and tumor lesions in mpMRI studies of prostate cancer patients. All lesions were
+                                                         interpreted by expert radiologists and assigned a PIRADS score. The normalized T2 was then validated for its ability to discriminate PIRADS 2-3 from 4-5
+                                                         classes. Regions of interest ROI were drawn in four distinct tissue types in fifty T2-weighted images from regular multiparametric prostate MRI mpMRI.
+                                                         The four reference tissue types were: obturator internus muscle, body fat, femoral head, bladder lumen. Four average ROI signals were computed per
+                                                         patient. Each reference tissue was assigned a fixed reference value T2 relaxation found in literature. Per patient, a smooth spline model was fitted to the
+                                                         average, reference pairs. The estimated spline model was then inverted to map patients' raw T2-weighted image scalar values to normalized values. The
+                                                         effect of the normalization was determined by computing and comparing the diagnostic accuracy using ROC analysis.
+                                                         Results
+                                                         The area under the ROC AUC was significantly higher p<0.05 in normalized T2.
+  
+                                                         Discussion / Conclusion
+                                                         The significant improvement of the diagnostic accuracy demonstrates the potential of our normalization method for the quantitative interpretation of T2-weighted prostate MRI. The results were similar to our previous method. The method still requires manual delineation of multiple reference tissues,
+                                                         however, we will develop deep learning segmentation methods to automate the method and enable regular clinical use.
+                                                         References
+                                                         1. Ahmed, Hashim U., et al. "Diagnostic accuracy of multi-parametric MRI and TRUS biopsy in prostate cancer (PROMIS): a paired validating confirmatory
+                                                         study." The Lancet 389.10071 (2017): 815-822.
+                                                         2. Rosenkrantz, Andrew B., et al. "Interobserver reproducibility of the PI-RADS version 2 lexicon: a multicenter study of six experienced prostate
+                                                         radiologists." Radiology 280.3 (2016): 793-804.
+                                                         3. Barentsz JO, et al. Synopsis of the PI-RADS v2 Guidelines for Multiparametric Prostate Magnetic Resonance Imaging and Recommendations for Use.
+                                                         Eur. Urol. 2016;69(1):41-49.
+                                                         4. Vos, Pieter C., et al. "Computer-assisted analysis of peripheral zone prostate lesions using T2-weighted and dynamic contrast-enhanced T1-weighted
+                                                         MRI." Physics Med. & Biol. 55.6 (2010): 1719},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -30761,28 +30794,28 @@ @conference{Stoi17b
   booktitle = RSNA,
   year = {2017},
   abstract = {PURPOSE
-                                                       To explore a novel multireferencetissue normalization method applied to t2weighted prostate MRI.
-                                                       METHOD AND MATERIALS
-                                                       Assuming the availability of a set of distinct reference tissue segmentations, the hypothesis is that it allows computing a patient specific sequence model
-                                                       that can normalize MRI. The normalization should produce similar scalar values in the same reference regions for different patients/scanners/sequences
-                                                       and interpolate in between reference values for other tissue areas. Regions of interest (ROI) were drawn in four distinct tissue types in a cohort of sixtyfive t2weighted images from regular multiparametric prostate MRI (mpMRI). The four reference tissue types were: skeletal muscle, body fat, femur
-                                                       head, bladder lumen. Four average ROI signals were computed per patient. Each reference tissue was assigned a fixed reference value (t2 relaxation
-                                                       found in literature). Per patient, a smooth sequence model was fitted to the (average, reference) pairs. The estimated sequence model was then
-                                                       inverted to map patients' raw t2weighted image scalar values to normalized values. To test the method, the effect of normalization on observed
-                                                       variance and tissue discriminability was analyzed. A leaveoneout experiment was performed in which for each ROI its normalized value was computed
-                                                       using the sequence model estimate using the three remaining reference ROIs. The difference between original t2weighted and normalized scalar MRI
-                                                       was analyzed by means of variability and ROC analysis.
-                                                       RESULTS
-                                                       Multireferencetissue normalization significantly (p<0.05) decreased variability and increased the area under the ROC curve for discriminating each
-                                                       reference tissue combination. The ROC curves in the figure show the effect of the normalization (T2n) on the discrimination between body fat and
-                                                       femur head tissue.
-                                                       CONCLUSION
-                                                       Semiautomatic multireferencetissue normalization shows reduced interpatient variability and may allow better quantitative discrimination between
-                                                       tissue types.
-                                                       CLINICAL RELEVANCE/APPLICATION
-                                                       Multireferencetissue t2weighted MRI normalization seems feasible. In combination with automatic segmentation, this could be providing clinical
-                                                       quantitative imaging support to mpMRI diagnosis of prostate cancer. This result motivates us to continue to explore the ability of this novel method to
-                                                       help detect and discriminate prostate cancer in mpMR},
+                                                         To explore a novel multi-reference-tissue normalization method applied to t2-weighted prostate MRI.
+                                                         METHOD AND MATERIALS
+                                                         Assuming the availability of a set of distinct reference tissue segmentations, the hypothesis is that it allows computing a patient specific sequence model
+                                                         that can normalize MRI. The normalization should produce similar scalar values in the same reference regions for different patients/scanners/sequences
+                                                         and interpolate in between reference values for other tissue areas. Regions of interest (ROI) were drawn in four distinct tissue types in a cohort of sixty-five t2-weighted images from regular multiparametric prostate MRI (mpMRI). The four reference tissue types were: skeletal muscle, body fat, femur
+                                                         head, bladder lumen. Four average ROI signals were computed per patient. Each reference tissue was assigned a fixed reference value (t2 relaxation
+                                                         found in literature). Per patient, a smooth sequence model was fitted to the (average, reference) pairs. The estimated sequence model was then
+                                                         inverted to map patients' raw t2-weighted image scalar values to normalized values. To test the method, the effect of normalization on observed
+                                                         variance and tissue discriminability was analyzed. A leave-one-out experiment was performed in which for each ROI its normalized value was computed
+                                                         using the sequence model estimate using the three remaining reference ROIs. The difference between original t2-weighted and normalized scalar MRI
+                                                         was analyzed by means of variability and ROC analysis.
+                                                         RESULTS
+                                                         Multi-reference-tissue normalization significantly (p<0.05) decreased variability and increased the area under the ROC curve for discriminating each
+                                                         reference tissue combination. The ROC curves in the figure show the effect of the normalization (T2n) on the discrimination between body fat and
+                                                         femur head tissue.
+                                                         CONCLUSION
+                                                         Semi-automatic multi-reference-tissue normalization shows reduced inter-patient variability and may allow better quantitative discrimination between
+                                                         tissue types.
+                                                         CLINICAL RELEVANCE/APPLICATION
+                                                         Multi-reference-tissue t2-weighted MRI normalization seems feasible. In combination with automatic segmentation, this could be providing clinical
+                                                         quantitative imaging support to mpMRI diagnosis of prostate cancer. This result motivates us to continue to explore the ability of this novel method to
+                                                         help detect and discriminate prostate cancer in mpMRI.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -30905,12 +30938,12 @@ @conference{Stud20
   year = {2020},
   optnote = {DIAG, RADIOLOGY},
   abstract = {Background & objectives: Tumour budding, and T-cells are robust prognostic biomarkers in colorectal cancer. A combined analysis is complex and can be greatly expedited and automated using deep learning. The implementation of computer-based analysis in diagnostics is challenging and necessitates extensive validation.
-
-                          Methods: Randomly selected (n=61) double-stained immunohistochemical slides (AE1-AE3 pancytokeratin for tumour buds and CD8 for cytotoxic T-cells) from our pT1 cohort from 3 different institutions were used to validate the deep learning algorithms for tumour budding and CD8 T-cell detection developed by the International Budding Consortium Computational Pathology Group. Staining and scanning were performed in a single laboratory.
-
-                          Results: In the visually identified tumour budding hotspot (0.785 mm2), tumour buds were manually annotated, and the output of the T-cell algorithm manually corrected by a single observer. For budding, 645 out of the 1'306 buds were correctly identified by the algorithm. Recall and precision were 49.4% and 61.4%, respectively. For the T-cells, 89.3% were correctly detected (from a total of 16'296). The recall was 90.3% and the precision was 87.3%. Reasons for misclassified T-cells included staining intensity, suboptimal tissue recognition and slide artifacts.
-
-                          Conclusion: Our preliminary data demonstrates satisfactory results for T-cell detection. Automated budding detection is more difficult, as inter-observer variability of bud calling is high among experts. These issues merit consideration when developing reliable deep learning algorithms examining the tumour/host interface.},
+  
+                            Methods: Randomly selected (n=61) double-stained immunohistochemical slides (AE1-AE3 pancytokeratin for tumour buds and CD8 for cytotoxic T-cells) from our pT1 cohort from 3 different institutions were used to validate the deep learning algorithms for tumour budding and CD8 T-cell detection developed by the International Budding Consortium Computational Pathology Group. Staining and scanning were performed in a single laboratory.
+  
+                            Results: In the visually identified tumour budding hotspot (0.785 mm2), tumour buds were manually annotated, and the output of the T-cell algorithm manually corrected by a single observer. For budding, 645 out of the 1'306 buds were correctly identified by the algorithm. Recall and precision were 49.4% and 61.4%, respectively. For the T-cells, 89.3% were correctly detected (from a total of 16'296). The recall was 90.3% and the precision was 87.3%. Reasons for misclassified T-cells included staining intensity, suboptimal tissue recognition and slide artifacts.
+  
+                            Conclusion: Our preliminary data demonstrates satisfactory results for T-cell detection. Automated budding detection is more difficult, as inter-observer variability of bud calling is high among experts. These issues merit consideration when developing reliable deep learning algorithms examining the tumour/host interface.},
 }
 
 @inproceedings{Stud22,
@@ -30921,16 +30954,16 @@ @inproceedings{Stud22
   optnote = {DIAG, RADIOLOGY},
   file = {Stud22.pdf:pdf/Stud22.pdf:PDF},
   abstract = {Introduction
-                          As pT1 colorectal cancers (CRC) tend to be overtreated, we investigate the previously proposed BTS (budding-T-cell-score = (#tumor-buds+1)/(#T-cells+1)) as a predictive marker to assess patients' need for resection. BTS was shown to be a better predictor of survival and other clinical factors than individual scoring.
-
-                          Materials and Methods
-                          We consider hotspots annotated by a pathologist according to the ITBCC guidelines on double-stained (AE1-AE3 pan-cytokeratin and CD8+) WSI from our pT1 CRC cohort (N=573). Within hotspots, tumor-buds and T-cells are automatically detected using convolutional neural networks and counted. The patients are divided into two groups based on their need for resection (no: N0 / follow-up without recurrence; yes: N1 / follow-up with recurrence). The dataset is imbalanced (89.2%/10.8%). To predict the patient group, we train a support-vector machine with data-balancing using the tumor-buds or T-cell counts individually, together, and just the BTS. We report the weighted accuracy, and sensitivity and specificity for the "yes" group.
-
-                          Results
-                          The highest weighted accuracy (62.8Tu 6.5%) and precision (17.6Tu 3.7%) are achieved using the tumor-buds count. Using the BTS achieves a sensitivity of 98.3Tu 2.9%, which outperforms the other models by more than 30%.
-
-                          Conclusion
-                          We show that combined assessment of tumor-buds and T-cells has the potential to serve as a predictive marker for the need of resection in pT1 cancers. However, there is still much room for improvement, as the low specificity still leads to overtreatment. We aim to address this in future work by also considering the spatial relationship of tumor-buds and T-cells and other predictive factors of nodal metastasis.},
+                            As pT1 colorectal cancers (CRC) tend to be overtreated, we investigate the previously proposed BTS (budding-T-cell-score = (#tumor-buds+1)/(#T-cells+1)) as a predictive marker to assess patients' need for resection. BTS was shown to be a better predictor of survival and other clinical factors than individual scoring.
+  
+                            Materials and Methods
+                            We consider hotspots annotated by a pathologist according to the ITBCC guidelines on double-stained (AE1-AE3 pan-cytokeratin and CD8+) WSI from our pT1 CRC cohort (N=573). Within hotspots, tumor-buds and T-cells are automatically detected using convolutional neural networks and counted. The patients are divided into two groups based on their need for resection (no: N0 / follow-up without recurrence; yes: N1 / follow-up with recurrence). The dataset is imbalanced (89.2%/10.8%). To predict the patient group, we train a support-vector machine with data-balancing using the tumor-buds or T-cell counts individually, together, and just the BTS. We report the weighted accuracy, and sensitivity and specificity for the "yes" group.
+  
+                            Results
+                            The highest weighted accuracy (62.8 +- 6.5%) and precision (17.6 +- 3.7%) are achieved using the tumor-buds count. Using the BTS achieves a sensitivity of 98.3 +- 2.9%, which outperforms the other models by more than 30%.
+  
+                            Conclusion
+                            We show that combined assessment of tumor-buds and T-cells has the potential to serve as a predictive marker for the need of resection in pT1 cancers. However, there is still much room for improvement, as the low specificity still leads to overtreatment. We aim to address this in future work by also considering the spatial relationship of tumor-buds and T-cells and other predictive factors of nodal metastasis.},
 }
 
 @article{Stur19,
@@ -31068,7 +31101,7 @@ @inproceedings{Swid20
   doi = {10.1117/12.2549650},
   series = {SPIE},
   abstract = {Diffuse large B-cell lymphoma (DLBCL) is the most common type of B-cell lymphoma. It is characterized by a heterogeneous morphology, genetic changes and clinical behavior. A small specific subgroup of DLBCL, harbouring a MYC gene translocation is associated with worse patient prognosis and outcome. Typically, the MYC translocation is assessed with a molecular test (FISH), that is expensive and time-consuming. Our hypothesis is that genetic changes, such as translocations could be visible as changes in the morphology of an HE-stained specimen. However, it has not proven possible to use morphological criteria for the detection of a MYC translocation in the diagnostic setting due to lack of specificity.
-                             In this paper, we apply a deep learning model to automate detection of the MYC translocations in DLBCL based on HE-stained specimens. The proposed method works at the whole-slide level and was developed based on a multicenter data cohort of 91 patients. All specimens were stained with HE, and the MYC translocation was confirmed using fluorescence in situ hybridization (FISH). The system was evaluated on an additional 66 patients, and obtained AUROC of 0.83 and accuracy of 0.77. The proposed method presents proof of a concept giving insights in the applicability of deep learning methods for detection of a genetic changes in DLBCL. In future work we will evaluate our algorithm for automatic pre-screen of DLBCL specimens to obviate FISH analysis in a large number of patients.},
+                               In this paper, we apply a deep learning model to automate detection of the MYC translocations in DLBCL based on HE-stained specimens. The proposed method works at the whole-slide level and was developed based on a multicenter data cohort of 91 patients. All specimens were stained with HE, and the MYC translocation was confirmed using fluorescence in situ hybridization (FISH). The system was evaluated on an additional 66 patients, and obtained AUROC of 0.83 and accuracy of 0.77. The proposed method presents a proof of concept giving insights into the applicability of deep learning methods for detection of genetic changes in DLBCL. In future work we will evaluate our algorithm for automatic pre-screening of DLBCL specimens to obviate FISH analysis in a large number of patients.},
   file = {:pdf/Swid20.pdf:PDF},
   optnote = {DIAG},
   year = {2020},
@@ -31408,29 +31441,29 @@ @article{Tan16
   doi = {10.1118/1.4953206},
   url = {http://scitation.aip.org/content/aapm/journal/medphys/43/7/10.1118/1.4953206},
   abstract = {Purpose: Automated 3D breast ultrasound (ABUS) has been proposed as a complementary screening
-                                                       modality to mammography for early detection of breast cancers. To facilitate the interpretation
-                                                       of ABUS images, automated diagnosis and detection techniques are being developed, in which
-                                                       malignant lesion segmentation plays an important role. However, automated segmentation of cancer
-                                                       in ABUS is challenging since lesion edges might not be well defined. In this study, the authors aim
-                                                       at developing an automated segmentation method for malignant lesions in ABUS that is robust to
-                                                       ill-defined cancer edges and posterior shadowing.
-
-                                                       Methods: A segmentation method using depth-guided dynamic programming based on spiral scanning
-                                                       is proposed. The method automatically adjusts aggressiveness of the segmentation according
-                                                       to the position of the voxels relative to the lesion center. Segmentation is more aggressive in the
-                                                       upper part of the lesion (close to the transducer) than at the bottom (far away from the transducer),
-                                                       where posterior shadowing is usually visible. The authors used Dice similarity coefficient (Dice)
-                                                       for evaluation. The proposed method is compared to existing state of the art approaches such as
-                                                       graph cut, level set, and smart opening and an existing dynamic programming method without depth
-                                                       dependence.
-
-                                                       Results: In a dataset of 78 cancers, our proposed segmentation method achieved a mean Dice of
-                                                       0.73+-0.14. The method outperforms an existing dynamic programming method (0.70+-0.16) on this
-                                                       task (p = 0.03) and it is also significantly (p < 0.001) better than graph cut (0.66+-0.18), level set
-                                                       based approach (0.63+-0.20) and smart opening (0.65+-0.12).
-
-                                                       Conclusions: The proposed depth-guided dynamic programming method achieves accurate breast
-                                                       malignant lesion segmentation results in automated breast ultrasound.},
+                                                         modality to mammography for early detection of breast cancers. To facilitate the interpretation
+                                                         of ABUS images, automated diagnosis and detection techniques are being developed, in which
+                                                         malignant lesion segmentation plays an important role. However, automated segmentation of cancer
+                                                         in ABUS is challenging since lesion edges might not be well defined. In this study, the authors aim
+                                                         at developing an automated segmentation method for malignant lesions in ABUS that is robust to
+                                                         ill-defined cancer edges and posterior shadowing.
+  
+                                                         Methods: A segmentation method using depth-guided dynamic programming based on spiral scanning
+                                                         is proposed. The method automatically adjusts aggressiveness of the segmentation according
+                                                         to the position of the voxels relative to the lesion center. Segmentation is more aggressive in the
+                                                         upper part of the lesion (close to the transducer) than at the bottom (far away from the transducer),
+                                                         where posterior shadowing is usually visible. The authors used Dice similarity coefficient (Dice)
+                                                         for evaluation. The proposed method is compared to existing state of the art approaches such as
+                                                         graph cut, level set, and smart opening and an existing dynamic programming method without depth
+                                                         dependence.
+  
+                                                         Results: In a dataset of 78 cancers, our proposed segmentation method achieved a mean Dice of
+                                                         0.73+-0.14. The method outperforms an existing dynamic programming method (0.70+-0.16) on this
+                                                         task (p = 0.03) and it is also significantly (p < 0.001) better than graph cut (0.66+-0.18), level set
+                                                         based approach (0.63+-0.20) and smart opening (0.65+-0.12).
+  
+                                                         Conclusions: The proposed depth-guided dynamic programming method achieves accurate breast
+                                                         malignant lesion segmentation results in automated breast ultrasound.},
   file = {Tan16.pdf:pdf\\Tan16.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {27370126},
@@ -31584,9 +31617,9 @@ @phdthesis{Tell21a
   title = {Advancing computational pathology with deep learning: from patches to gigapixel image-level classification},
   url = {https://repository.ubn.ru.nl/handle/2066/233752},
   abstract = {The main focus of this work is to investigate novel deep learning based methodologies to improve breast cancer prognostic tools within the context of Computational Pathology. This research can be divided into three key blocks:
-                             1. Fundamental challenges in Computational Pathology. We address some of the issues that arise when developing deep learning based models across applications and organs. First, scaling the generation of pixel-level annotated data (Chapter 2). Second, addressing intra- and inter-center stain variation (Chapters 2 and 3). Third, developing accurate and fast models to process entire whole-slide images (Chapters 2 and 4).
-                             2. Automating the core component of breast cancer grading: performing mitosis detection at scale, that is, processing thousands of unseen multicenter entire whole-slide images, while deriving actionable insights for pathologists (Chapter 2).
-                             3. Performing whole-slide image classification. We propose a method that enables feeding entire whole-slide images to a single deep learning based model , targeting patient-level labels and outcome data such as overall survival(Chapters 4 and 5).},
+                               1. Fundamental challenges in Computational Pathology. We address some of the issues that arise when developing deep learning based models across applications and organs. First, scaling the generation of pixel-level annotated data (Chapter 2). Second, addressing intra- and inter-center stain variation (Chapters 2 and 3). Third, developing accurate and fast models to process entire whole-slide images (Chapters 2 and 4).
+                               2. Automating the core component of breast cancer grading: performing mitosis detection at scale, that is, processing thousands of unseen multicenter entire whole-slide images, while deriving actionable insights for pathologists (Chapter 2).
+                               3. Performing whole-slide image classification. We propose a method that enables feeding entire whole-slide images to a single deep learning based model, targeting patient-level labels and outcome data such as overall survival (Chapters 4 and 5).},
   copromotor = {F. Ciompi and G. Litjens},
   file = {Tell21a.pdf:pdf\\Tell21a.pdf:PDF},
   optnote = {DIAG},
@@ -31621,20 +31654,20 @@ @article{Terh21
   url = {http://dx.doi.org/10.1186/s12874-021-01243-8},
   volume = {21},
   abstract = {Abstract
-                                         Background
-                                         Recruiting asymptomatic participants with early disease stages into studies is challenging and only little is known about facilitators and barriers to screening and recruitment of study participants. Thus we assessed factors associated with screening rates in the MACUSTAR study, a multi-centre, low-interventional cohort study of early stages of age-related macular degeneration (AMD).
-
-                                         Methods
-                                         Screening rates per clinical site and per week were compiled and applicable recruitment factors were assigned to respective time periods. A generalized linear mixed-effects model including the most relevant recruitment factors identified via in-depth interviews with study personnel was fitted to the screening data. Only participants with intermediate AMD were considered.
-
-                                         Results
-                                         A total of 766 individual screenings within 87 weeks were available for analysis. The mean screening rate was 0.6 +- 0.9 screenings per week among all sites. The participation at investigator teleconferences (relative risk increase 1.466, 95% CI [1.018-2.112]), public holidays (relative risk decrease 0.466, 95% CI [0.367-0.591]) and reaching 80% of the site's recruitment target (relative risk decrease 0.699, 95% CI [0.367-0.591]) were associated with the number of screenings at an individual site level.
-
-                                         Conclusions
-                                         Careful planning of screening activities is necessary when recruiting early disease stages in multi-centre observational or low-interventional studies. Conducting teleconferences with local investigators can increase screening rates. When planning recruitment, seasonal and saturation effects at clinical site level need to be taken into account.
-
-                                         Trial registration
-                                         <jats:ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://clinicaltrials.gov">ClinicalTrials.gov</jats:ext-link><jats:ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="https://clinicaltrials.gov/ct2/show/NCT03349801">NCT03349801</jats:ext-link>. Registered on 22 November 2017.},
+                                           Background
+                                           Recruiting asymptomatic participants with early disease stages into studies is challenging and only little is known about facilitators and barriers to screening and recruitment of study participants. Thus we assessed factors associated with screening rates in the MACUSTAR study, a multi-centre, low-interventional cohort study of early stages of age-related macular degeneration (AMD).
+  
+                                           Methods
+                                           Screening rates per clinical site and per week were compiled and applicable recruitment factors were assigned to respective time periods. A generalized linear mixed-effects model including the most relevant recruitment factors identified via in-depth interviews with study personnel was fitted to the screening data. Only participants with intermediate AMD were considered.
+  
+                                           Results
+                                           A total of 766 individual screenings within 87 weeks were available for analysis. The mean screening rate was 0.6 +- 0.9 screenings per week among all sites. The participation at investigator teleconferences (relative risk increase 1.466, 95% CI [1.018-2.112]), public holidays (relative risk decrease 0.466, 95% CI [0.367-0.591]) and reaching 80% of the site's recruitment target (relative risk decrease 0.699, 95% CI [0.367-0.591]) were associated with the number of screenings at an individual site level.
+  
+                                           Conclusions
+                                           Careful planning of screening activities is necessary when recruiting early disease stages in multi-centre observational or low-interventional studies. Conducting teleconferences with local investigators can increase screening rates. When planning recruitment, seasonal and saturation effects at clinical site level need to be taken into account.
+  
+                                           Trial registration
+                                           ClinicalTrials.gov NCT03349801 (https://clinicaltrials.gov/ct2/show/NCT03349801). Registered on 22 November 2017.},
   all_ss_ids = {[32af51ced47419cff26fde66cce602fbab2f238a]},
   automatic = {yes},
   citation-count = {4},
@@ -31726,34 +31759,34 @@ @conference{Teuw17b
   booktitle = RSNA,
   year = {2017},
   abstract = {PURPOSE
-                                                       In this study we evaluated the potential of a computer system to select exams with low likelihood of
-                                                       containing cancer.
-
-                                                       METHOD AND MATERIALS
-                                                       We collected a representative set of 1649 referrals with different screening outcome from the Dutch
-                                                       breast cancer screening. The dataset comprised 489 true positives (TP) exams and 1160 false
-                                                       positive (FP) exams. In addition, we collected 1000 true negative (TN) exams from the same
-                                                       screening population. All exams were automatically analyzed with Transpara v1.2.0 (ScreenPoint
-                                                       Medical, Nijmegen, The Netherlands). Transpara uses deep learning algorithms to, based on
-                                                       soft-tissue lesions and calcifications findings, categorize every mammogram on a 10-point scale. This
-                                                       computerized score represents the likelihood that a cancer is present in the exam at hand, where 10
-                                                       represents the highest likelihood that a cancer is present. It is defined in such a way that, in a
-                                                       screening setting, the number of mammograms in each category is roughly equal.
-
-                                                       In this study, we determined the distribution of the computerized cancer likelihood scores for the TP,
-                                                       FP and TN exams. In particular we quantified for each category the fraction of cases with a cancer
-                                                       likelihood score below or equal to 5, including about 50% of the mammograms. Additionally we
-                                                       evaluated the positive predictive value (PPV) of referrals in each likelihood category.
-
-                                                       RESULTS
-                                                       5.11% of the TPs, 20.3% of the FPs and 45.0% of the TNs were assigned to the likelihood categories
-                                                       1 to 5. This corresponds to 0.7 cancers per 1000 in the group with score 1-5 and 11.2 per 1000 with a
-                                                       score higher than 5, based on the cancer detection rate of 6.5/1000 in the Dutch screening program.
-                                                       The PPV was 8.00%, 8.14%, and 44.9% for cancer likelihood scores 1, 5 and 10, respectively.
-
-                                                       CONCLUSION
-                                                       Automated identification of a fraction of screening mammograms that most likely are normal is
-                                                       feasible.},
+                                                         In this study we evaluated the potential of a computer system to select exams with low likelihood of
+                                                         containing cancer.
+  
+                                                         METHOD AND MATERIALS
+                                                         We collected a representative set of 1649 referrals with different screening outcome from the Dutch
+                                                         breast cancer screening. The dataset comprised 489 true positives (TP) exams and 1160 false
+                                                         positive (FP) exams. In addition, we collected 1000 true negative (TN) exams from the same
+                                                         screening population. All exams were automatically analyzed with Transpara v1.2.0 (ScreenPoint
+                                                         Medical, Nijmegen, The Netherlands). Transpara uses deep learning algorithms to, based on
+                                                         soft-tissue lesions and calcifications findings, categorize every mammogram on a 10-point scale. This
+                                                         computerized score represents the likelihood that a cancer is present in the exam at hand, where 10
+                                                         represents the highest likelihood that a cancer is present. It is defined in such a way that, in a
+                                                         screening setting, the number of mammograms in each category is roughly equal.
+  
+                                                         In this study, we determined the distribution of the computerized cancer likelihood scores for the TP,
+                                                         FP and TN exams. In particular we quantified for each category the fraction of cases with a cancer
+                                                         likelihood score below or equal to 5, including about 50% of the mammograms. Additionally we
+                                                         evaluated the positive predictive value (PPV) of referrals in each likelihood category.
+  
+                                                         RESULTS
+                                                         5.11% of the TPs, 20.3% of the FPs and 45.0% of the TNs were assigned to the likelihood categories
+                                                         1 to 5. This corresponds to 0.7 cancers per 1000 in the group with score 1-5 and 11.2 per 1000 with a
+                                                         score higher than 5, based on the cancer detection rate of 6.5/1000 in the Dutch screening program.
+                                                         The PPV was 8.00%, 8.14%, and 44.9% for cancer likelihood scores 1, 5 and 10, respectively.
+  
+                                                         CONCLUSION
+                                                         Automated identification of a fraction of screening mammograms that most likely are normal is
+                                                         feasible.},
   optnote = {DIAG},
 }
 
@@ -32041,17 +32074,17 @@ @article{Tura21
   url = {http://dx.doi.org/10.1007/s00261-021-03207-4},
   volume = {47},
   abstract = {Abstract
-                                         Objectives
-                                         Over 2500 percutaneous transhepatic cholangiography and biliary drainage (PTCD) procedures are yearly performed in the Netherlands. Most interventions are performed for treatment of biliary obstruction following unsuccessful endoscopic biliary cannulation. Our aim was to evaluate complication rates and risk factors for complications in PTCD patients after failed ERCP.
-
-                                         Methods
-                                         We performed an observational study collecting data from a cohort that was subjected to PTCD during a 5-year period in one academic and four teaching hospitals. Primary objective was the development of infectious (sepsis, cholangitis, abscess, or cholecystitis) and non-infectious complications (bile leakage, severe hemorrhage, etc.) and mortality within 30 days of the procedure. Subsequently, risk factors for complications and mortality were analyzed with a multilevel logistic regression analysis.
-
-                                         Results
-                                         A total of 331 patients underwent PTCD of whom 205 (61.9%) developed PTCD-related complications. Of the 224 patients without a pre-existent infection, 91 (40.6%) developed infectious complications, i.e., cholangitis in 26.3%, sepsis in 24.6%, abscess formation in 2.7%, and cholecystitis in 1.3%. Non-infectious complications developed in 114 of 331 patients (34.4%). 30-day mortality was 17.2% (N = 57). Risk factors for infectious complications included internal drainage and drain obstruction, while multiple re-interventions were a risk factor for non-infectious complications.
-
-                                         Conclusion
-                                         Both infectious and non-infectious complications are frequent after PTCD, most often due to biliary drain obstruction.},
+                                           Objectives
+                                           Over 2500 percutaneous transhepatic cholangiography and biliary drainage (PTCD) procedures are yearly performed in the Netherlands. Most interventions are performed for treatment of biliary obstruction following unsuccessful endoscopic biliary cannulation. Our aim was to evaluate complication rates and risk factors for complications in PTCD patients after failed ERCP.
+  
+                                           Methods
+                                           We performed an observational study collecting data from a cohort that was subjected to PTCD during a 5-year period in one academic and four teaching hospitals. Primary objective was the development of infectious (sepsis, cholangitis, abscess, or cholecystitis) and non-infectious complications (bile leakage, severe hemorrhage, etc.) and mortality within 30 days of the procedure. Subsequently, risk factors for complications and mortality were analyzed with a multilevel logistic regression analysis.
+  
+                                           Results
+                                           A total of 331 patients underwent PTCD of whom 205 (61.9%) developed PTCD-related complications. Of the 224 patients without a pre-existent infection, 91 (40.6%) developed infectious complications, i.e., cholangitis in 26.3%, sepsis in 24.6%, abscess formation in 2.7%, and cholecystitis in 1.3%. Non-infectious complications developed in 114 of 331 patients (34.4%). 30-day mortality was 17.2% (N = 57). Risk factors for infectious complications included internal drainage and drain obstruction, while multiple re-interventions were a risk factor for non-infectious complications.
+  
+                                           Conclusion
+                                           Both infectious and non-infectious complications are frequent after PTCD, most often due to biliary drain obstruction.},
   all_ss_ids = {[b03e1e450d04641eb219abf19f7a33f99834ce95]},
   automatic = {yes},
   citation-count = {9},
@@ -32253,7 +32286,7 @@ @mastersthesis{Vala19
   author = {Valacchi, Lorenzo},
   title = {Analysis and endotracheal tube detection in chest x-rays using deep learning},
   abstract = {The following work is focusing on the development of two deep learning models applied to chest x-rays. The first model, Imagesorter, provides a solution for sorting chest x-ray images where metadata is not available or is unreliable. This is frequently the case when accessing large collection of radiographs and can result in very time consuming procedures to obtain reliable data. Specifically, the algorithm returns four properties of the image: the type of image presented, rotation (wheather the image is rotated), inversion (whether the grayscale level of the radiograph inverted) and orientation (whether a lateral chest x-ray is mirrored). Nearly 30,000 radiographs were gathered and used to train, validate and test a deep convolutional neural network. For the purpose, a ResNet50 network pretrained on ImageNet and finetuned on the chest x-ray dataset was used. Moreover, the network architechture was modified to return all the four features at once. The model achieved very good results over the test set and can be consider a valid tool to efficiently explore and sort large x-ray collections. The second model, Endotracheal-Tube, detect the presence of an endotracheal tube in a chest x-ray. Many automated methods require to gather chest x-rays where an endotracheal tube is present. The presented algorithm can help gather reliable data from large collection in a short amount of time. A large dataset was created for the project and a preprocessing method to crop a square area of the image where the tube lays is presented. Four models are trained, validated and tested over the same dataset to assess the best. At the end an InceptionV3 network pretrained on ImageNet and finetuned on the dataset achieved the best results (AUC = 0.993).
-                              Both projects are part of OpenCXR, an open source library developed by the Chest X-Ray teams at the Dignostic Image Analysis Group at the Radboud University Medical Center, Nijmegen, The Netherlands.},
+                                Both projects are part of OpenCXR, an open source library developed by the Chest X-Ray teams at the Diagnostic Image Analysis Group at the Radboud University Medical Center, Nijmegen, The Netherlands.},
   file = {Vala19.pdf:pdf\\Vala19.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   school = {Universita di Siena},
@@ -32266,16 +32299,16 @@ @conference{Valk19
   booktitle = ARVO,
   title = {Familial discordance in disease phenotype in siblings with Stargardt disease},
   abstract = {Purpose:
-                                                       To investigate intersibling discordance of the Stargardt disease (STGD1) phenotype.
-
-                                                       Methods:
-                                                       We performed a retrospective cohort study among siblings with genetically confirmed STGD1 and at least one available fundus autofluorescence (FAF) image of both eyes. We compared age of onset within families using the youngest patient as the reference and a predetermined threshold value of 10 years for significant differences. Disease duration was matched to investigate differences in best-corrected visual acuity, and we determined and compared the survival time for reaching severe visual impairment (SVI); (<20/200 Snellen or > 1.3 Logarithm of the Minimal Angle of Resolution (LogMAR)). Central retinal atrophy surface area was quantified and compared by two independent graders using the semi-automated EyeNED software. Additionally, both graders performed qualitative assessment of FAF patterns to identify phenotypic differences and commonalities. Main outcome measures included differences in age of onset, best-corrected visual acuity (BCVA), time to develop legal blindness, FAF atrophy surface area and autofluorescence patterns.
-
-                                                       Results:
-                                                       Significant differences in age of onset were present in 5/17 families, ranging from 13 to 39 years. BCVA was matched in 12/17 families and the median difference was 0.41 (0 - 1.10) LogMAR for the right and 0.41 (0 - 1.08) LogMAR for the left eye, and we found extreme differences in five families ranging from 0.58 to 1.1 LogMAR. The median age at which patients developed SVI was 14 years. We observed significant differences in time to develop SVI in three out of 12 families with matched survival times, ranging from 14 to 29 years. Median central retinal atrophy surface area was 11.38 mm2 in the right (range 1.98 - 44.78 mm2) and 10.59 mm2 in the left (range 1.61 - 40.59 mm2) eyes and was highly comparable between siblings, with the exception of family one. Qualitative FAF phenotypes were comparable in all sibling pairs.
-
-                                                       Conclusions:
-                                                       Phenotypic discordance between siblings with STGD1 disease carrying the same ABCA4 variants is a prevalent phenomenon. Functional outcomes can differ substantially despite highly comparable FAF phenotypes, which complicates sibling-based prognosis. While environmental factor are likely to modify the disease course, the relatively young median age at which patients develop SVI indicates an important role for genetic factors as disease modifiers.},
+                                                         To investigate intersibling discordance of the Stargardt disease (STGD1) phenotype.
+  
+                                                         Methods:
+                                                         We performed a retrospective cohort study among siblings with genetically confirmed STGD1 and at least one available fundus autofluorescence (FAF) image of both eyes. We compared age of onset within families using the youngest patient as the reference and a predetermined threshold value of 10 years for significant differences. Disease duration was matched to investigate differences in best-corrected visual acuity, and we determined and compared the survival time for reaching severe visual impairment (SVI); (<20/200 Snellen or > 1.3 Logarithm of the Minimal Angle of Resolution (LogMAR)). Central retinal atrophy surface area was quantified and compared by two independent graders using the semi-automated EyeNED software. Additionally, both graders performed qualitative assessment of FAF patterns to identify phenotypic differences and commonalities. Main outcome measures included differences in age of onset, best-corrected visual acuity (BCVA), time to develop legal blindness, FAF atrophy surface area and autofluorescence patterns.
+  
+                                                         Results:
+                                                         Significant differences in age of onset were present in 5/17 families, ranging from 13 to 39 years. BCVA was matched in 12/17 families and the median difference was 0.41 (0 - 1.10) LogMAR for the right and 0.41 (0 - 1.08) LogMAR for the left eye, and we found extreme differences in five families ranging from 0.58 to 1.1 LogMAR. The median age at which patients developed SVI was 14 years. We observed significant differences in time to develop SVI in three out of 12 families with matched survival times, ranging from 14 to 29 years. Median central retinal atrophy surface area was 11.38 mm2 in the right (range 1.98 - 44.78 mm2) and 10.59 mm2 in the left (range 1.61 - 40.59 mm2) eyes and was highly comparable between siblings, with the exception of family one. Qualitative FAF phenotypes were comparable in all sibling pairs.
+  
+                                                         Conclusions:
+                                                         Phenotypic discordance between siblings with STGD1 disease carrying the same ABCA4 variants is a prevalent phenomenon. Functional outcomes can differ substantially despite highly comparable FAF phenotypes, which complicates sibling-based prognosis. While environmental factors are likely to modify the disease course, the relatively young median age at which patients develop SVI indicates an important role for genetic factors as disease modifiers.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
 }
@@ -32292,25 +32325,25 @@ @article{Valk19a
   doi = {10.1016/j.ophtha.2019.07.010},
   url = {https://www.sciencedirect.com/science/article/pii/S0161642019306578?via%3Dihub},
   abstract = {Purpose
-                                                       To investigate intersibling phenotypic concordance in Stargardt disease (STGD1).
-
-                                                       Design
-                                                       Retrospective cohort study.
-
-                                                       Participants
-                                                       Siblings with genetically confirmed STGD1 and at least 1 available fundus autofluorescence (FAF) image of both eyes.
-
-                                                       Methods
-                                                       We compared age at onset within families. Disease duration was matched to investigate differences in best-corrected visual acuity (BCVA) and compared the survival time for reaching severe visual impairment (<20/200 Snellen or >1.0 logarithm of the minimum angle of resolution [logMAR]). Central retinal atrophy area was quantified independently by 2 experienced graders using semiautomated software and compared between siblings. Both graders performed qualitative assessment of FAF and spectral-domain (SD) OCT images to identify phenotypic differences.
-
-                                                       Main Outcome Measures
-                                                       Differences in age at onset, disease duration-matched BCVA, time to severe visual impairment development, FAF atrophy area, FAF patterns, and genotypes.
-
-                                                       Results
-                                                       Substantial differences in age at onset were present in 5 of 17 families, ranging from 13 to 39 years. Median BCVA at baseline was 0.60 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/80 [range, 20/12-hand movements]) in the right eye and 0.50 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/63 [range, 20/12-hand movements]) in the left eye. Disease duration-matched BCVA was investigated in 12 of 17 families, and the median difference was 0.41 logMAR (range, 0.00-1.10 logMAR) for the right eye and 0.41 logMAR (range, 0.00-1.08 logMAR) for the left eye. We observed notable differences in time to severe visual impairment development in 7 families, ranging from 1 to 29 years. Median central retinal atrophy area was 11.38 mm2 in the right eye (range, 1.98-44.78 mm2) and 10.59 mm2 in the left eye (range, 1.61-40.59 mm2) and highly comparable between siblings. Similarly, qualitative FAF and SD OCT phenotypes were highly comparable between siblings.
-
-                                                       Conclusions
-                                                       Phenotypic discordance between siblings with STGD1 carrying the same ABCA4 variants is a prevalent phenomenon. Although the FAF phenotypes are highly comparable between siblings, functional outcomes differ substantially. This complicates both sibling-based prognosis and genotype-phenotype correlations and has important implications for patient care and management.},
+                                                         To investigate intersibling phenotypic concordance in Stargardt disease (STGD1).
+  
+                                                         Design
+                                                         Retrospective cohort study.
+  
+                                                         Participants
+                                                         Siblings with genetically confirmed STGD1 and at least 1 available fundus autofluorescence (FAF) image of both eyes.
+  
+                                                         Methods
+                                                         We compared age at onset within families. Disease duration was matched to investigate differences in best-corrected visual acuity (BCVA) and compared the survival time for reaching severe visual impairment (<20/200 Snellen or >1.0 logarithm of the minimum angle of resolution [logMAR]). Central retinal atrophy area was quantified independently by 2 experienced graders using semiautomated software and compared between siblings. Both graders performed qualitative assessment of FAF and spectral-domain (SD) OCT images to identify phenotypic differences.
+  
+                                                         Main Outcome Measures
+                                                         Differences in age at onset, disease duration-matched BCVA, time to severe visual impairment development, FAF atrophy area, FAF patterns, and genotypes.
+  
+                                                         Results
+                                                         Substantial differences in age at onset were present in 5 of 17 families, ranging from 13 to 39 years. Median BCVA at baseline was 0.60 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/80 [range, 20/12-hand movements]) in the right eye and 0.50 logMAR (range, -0.20 to 2.30 logMAR; Snellen equivalent, 20/63 [range, 20/12-hand movements]) in the left eye. Disease duration-matched BCVA was investigated in 12 of 17 families, and the median difference was 0.41 logMAR (range, 0.00-1.10 logMAR) for the right eye and 0.41 logMAR (range, 0.00-1.08 logMAR) for the left eye. We observed notable differences in time to severe visual impairment development in 7 families, ranging from 1 to 29 years. Median central retinal atrophy area was 11.38 mm2 in the right eye (range, 1.98-44.78 mm2) and 10.59 mm2 in the left eye (range, 1.61-40.59 mm2) and highly comparable between siblings. Similarly, qualitative FAF and SD OCT phenotypes were highly comparable between siblings.
+  
+                                                         Conclusions
+                                                         Phenotypic discordance between siblings with STGD1 carrying the same ABCA4 variants is a prevalent phenomenon. Although the FAF phenotypes are highly comparable between siblings, functional outcomes differ substantially. This complicates both sibling-based prognosis and genotype-phenotype correlations and has important implications for patient care and management.},
   file = {Valk19a.pdf:pdf\\Valk19a.pdf:PDF},
   optnote = {DIAG},
   pmid = {31522899},
@@ -32896,16 +32929,16 @@ @article{Ven16a
   doi = {10.1016/j.clinimag.2016.02.005},
   url = {http://dx.doi.org/10.1016/j.clinimag.2016.02.005},
   abstract = {Objectives
-                                                       To determine TRUS visibility of MR lesions.
-
-                                                       Methods
-                                                       Data from 34 patients with 56 MR lesions and prostatectomy was used. Five observers localized and determined TRUS visibility during retrospective fusion. Visibility was correlated to PIRADS and Gleason scores.
-
-                                                       Results
-                                                       TRUS visibility occurred in 43% of all MR lesions and 62% of PIRADS 5 lesions. Visible lesions had a significantly lower localization variability. On prostatectomy, 58% of the TRUS visible lesions had a Gleason 4 or 5 component.
-
-                                                       Conclusions
-                                                       Almost half of the MR lesions were visible on TRUS. TRUS visible lesions were more aggressive than TRUS invisible lesions.},
+                                                         To determine TRUS visibility of MR lesions.
+  
+                                                         Methods
+                                                         Data from 34 patients with 56 MR lesions and prostatectomy was used. Five observers localized and determined TRUS visibility during retrospective fusion. Visibility was correlated to PIRADS and Gleason scores.
+  
+                                                         Results
+                                                         TRUS visibility occurred in 43% of all MR lesions and 62% of PIRADS 5 lesions. Visible lesions had a significantly lower localization variability. On prostatectomy, 58% of the TRUS visible lesions had a Gleason 4 or 5 component.
+  
+                                                         Conclusions
+                                                         Almost half of the MR lesions were visible on TRUS. TRUS visible lesions were more aggressive than TRUS invisible lesions.},
   file = {Ven16a.pdf:pdf\\Ven16a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {25979040},
@@ -32943,12 +32976,12 @@ @article{Vend17c
   pages = {1849-1855},
   doi = {10.1007/s00345-017-2085-6},
   abstract = {PURPOSE: To compare clinically significant prostate cancer (csPCa) detection rates between magnetic resonance imaging (MRI)-transrectal ultrasound (TRUS) fusion-guided prostate biopsy (FGB) and direct in-bore MRI-guided biopsy (MRGB).
-                                                       METHODS:
-                                                       We performed a comparison of csPCa detection rates between FGB and MRGB. Included patients had (1) at least one prior negative TRUS biopsy; (2) a Prostate Imaging Reporting and Data System (PI-RADS) 4 or 5 lesion and (3) a lesion size of >=8 mm measured in at least one direction. We considered a Gleason score >=7 being csPCa. Descriptive statistics with 95% confidence intervals (CI) were used to determine any differences.
-                                                       RESULTS:
-                                                       We included 51 patients with FGB (59 PI-RADS 4 and 41% PI-RADS 5) and 227 patients with MRGB (34 PI-RADS 4 and 66% PI-RADS 5). Included patients had a median age of 69 years (IQR, 65-72) and a median PSA level of 11.0 ng/ml (IQR, 7.4-15.1) and a median age of 67 years (IQR, 61-70), the median PSA 12.8 ng/ml (IQR, 9.1-19.0) within the FGB and the MRGB group, respectively. Detection rates of csPCA did not differ significantly between FGB and MRGB, 49 vs. 61%, respectively.
-                                                       CONCLUSION:
-                                                       We did not detect significant differences between FGB and MRGB in the detection of csPCa. The differences in detection ratios between both biopsy techniques are narrow with an increasing lesion size. This study warrants further studies to optimize selection of best biopsy modality.},
+                                                         METHODS:
+                                                         We performed a comparison of csPCa detection rates between FGB and MRGB. Included patients had (1) at least one prior negative TRUS biopsy; (2) a Prostate Imaging Reporting and Data System (PI-RADS) 4 or 5 lesion and (3) a lesion size of >=8 mm measured in at least one direction. We considered a Gleason score >=7 being csPCa. Descriptive statistics with 95% confidence intervals (CI) were used to determine any differences.
+                                                         RESULTS:
+                                                         We included 51 patients with FGB (59% PI-RADS 4 and 41% PI-RADS 5) and 227 patients with MRGB (34% PI-RADS 4 and 66% PI-RADS 5). Included patients had a median age of 69 years (IQR, 65-72) and a median PSA level of 11.0 ng/ml (IQR, 7.4-15.1) and a median age of 67 years (IQR, 61-70), the median PSA 12.8 ng/ml (IQR, 9.1-19.0) within the FGB and the MRGB group, respectively. Detection rates of csPCa did not differ significantly between FGB and MRGB, 49 vs. 61%, respectively.
+                                                         CONCLUSION:
+                                                         We did not detect significant differences between FGB and MRGB in the detection of csPCa. The differences in detection ratios between both biopsy techniques are narrow with an increasing lesion size. This study warrants further studies to optimize selection of best biopsy modality.},
   file = {Vend17c.pdf:pdf\\Vend17c.pdf:PDF},
   optnote = {DIAG, MAGIC, RADIOLOGY},
   pmid = {28871396},
@@ -32966,11 +32999,11 @@ @article{Vend18
   pages = {219-227},
   volume = {4},
   abstract = {CONTEXT: The main difference between the available magnetic resonance imaging-transrectal ultrasound (MRI-TRUS) fusion platforms for prostate biopsy is the method of image registration being either rigid or elastic. As elastic registration compensates for possible deformation caused by the introduction of an ultrasound probe for example, it is expected that it would perform better than rigid registration.
-                                                       OBJECTIVE: The aim of this meta-analysis is to compare rigid with elastic registration by calculating the detection odds ratio (OR) for both subgroups. The detection OR is defined as the ratio of the odds of detecting clinically significant prostate cancer (csPCa) by MRI-TRUS fusion biopsy compared with systematic TRUS biopsy. Secondary objectives were the OR for any PCa and the OR after pooling both registration techniques.
-                                                       EVIDENCE ACQUISITION: The electronic databases PubMed, Embase, and Cochrane were systematically searched for relevant studies according to the Preferred Reporting Items for Systematic Review and Meta-analysis Statement. Studies comparing MRI-TRUS fusion and systematic TRUS-guided biopsies in the same patient were included. The quality assessment of included studies was performed using the Quality Assessment of Diagnostic Accuracy Studies version 2.
-                                                       EVIDENCE SYNTHESIS: Eleven papers describing elastic and 10 describing rigid registration were included. Meta-analysis showed an OR of csPCa for elastic and rigid registration of 1.45 (95% confidence interval [CI]: 1.21-1.73, p<0.0001) and 1.40 (95% CI: 1.13-1.75, p=0.002), respectively. No significant difference was seen between the subgroups (p=0.83). Pooling subgroups resulted in an OR of 1.43 (95% CI: 1.25-1.63, p<0.00001).
-                                                       CONCLUSIONS: No significant difference was identified between rigid and elastic registration for MRI-TRUS fusion-guided biopsy in the detection of csPCa; however, both techniques detected more csPCa than TRUS-guided biopsy alone.
-                                                       PATIENT SUMMARY: We did not identify any significant differences in prostate cancer detection between two distinct magnetic resonance imaging-transrectal ultrasound fusion systems which vary in their method of compensating for prostate deformation.},
+                                                         OBJECTIVE: The aim of this meta-analysis is to compare rigid with elastic registration by calculating the detection odds ratio (OR) for both subgroups. The detection OR is defined as the ratio of the odds of detecting clinically significant prostate cancer (csPCa) by MRI-TRUS fusion biopsy compared with systematic TRUS biopsy. Secondary objectives were the OR for any PCa and the OR after pooling both registration techniques.
+                                                         EVIDENCE ACQUISITION: The electronic databases PubMed, Embase, and Cochrane were systematically searched for relevant studies according to the Preferred Reporting Items for Systematic Review and Meta-analysis Statement. Studies comparing MRI-TRUS fusion and systematic TRUS-guided biopsies in the same patient were included. The quality assessment of included studies was performed using the Quality Assessment of Diagnostic Accuracy Studies version 2.
+                                                         EVIDENCE SYNTHESIS: Eleven papers describing elastic and 10 describing rigid registration were included. Meta-analysis showed an OR of csPCa for elastic and rigid registration of 1.45 (95% confidence interval [CI]: 1.21-1.73, p<0.0001) and 1.40 (95% CI: 1.13-1.75, p=0.002), respectively. No significant difference was seen between the subgroups (p=0.83). Pooling subgroups resulted in an OR of 1.43 (95% CI: 1.25-1.63, p<0.00001).
+                                                         CONCLUSIONS: No significant difference was identified between rigid and elastic registration for MRI-TRUS fusion-guided biopsy in the detection of csPCa; however, both techniques detected more csPCa than TRUS-guided biopsy alone.
+                                                         PATIENT SUMMARY: We did not identify any significant differences in prostate cancer detection between two distinct magnetic resonance imaging-transrectal ultrasound fusion systems which vary in their method of compensating for prostate deformation.},
   file = {:pdf/Vend18.pdf:PDF},
   journal = EUF,
   optnote = {DIAG, MAGIC, RADIOLOGY},
@@ -33000,14 +33033,14 @@ @article{Vend24
   url = {http://dx.doi.org/10.1371/journal.pone.0301969},
   volume = {19},
   abstract = {<jats:sec id="sec001">
-       Purpose
-       This study aims to introduce an innovative multi-step pipeline for automatic tumor-stroma ratio (TSR) quantification as a potential prognostic marker for pancreatic cancer, addressing the limitations of existing staging systems and the lack of commonly used prognostic biomarkers. <jats:sec id="sec002">
-       Methods
-       The proposed approach involves a deep-learning-based method for the automatic segmentation of tumor epithelial cells, tumor bulk, and stroma from whole-slide images (WSIs). Models were trained using five-fold cross-validation and evaluated on an independent external test set. TSR was computed based on the segmented components. Additionally, TSR's predictive value for six-month survival on the independent external dataset was assessed. <jats:sec id="sec003">
-       Results
-       Median Dice (inter-quartile range (IQR)) of 0.751(0.15) and 0.726(0.25) for tumor epithelium segmentation on internal and external test sets, respectively. Median Dice of 0.76(0.11) and 0.863(0.17) for tumor bulk segmentation on internal and external test sets, respectively. TSR was evaluated as an independent prognostic marker, demonstrating a cross-validation AUC of 0.61+-0.12 for predicting six-month survival on the external dataset. <jats:sec id="sec004">
-       Conclusion
-       Our pipeline for automatic TSR quantification offers promising potential as a prognostic marker for pancreatic cancer. The results underscore the feasibility of computational biomarker discovery in enhancing patient outcome prediction, thus contributing to personalized patient management.},
+         Purpose
+         This study aims to introduce an innovative multi-step pipeline for automatic tumor-stroma ratio (TSR) quantification as a potential prognostic marker for pancreatic cancer, addressing the limitations of existing staging systems and the lack of commonly used prognostic biomarkers.
+         Methods
+         The proposed approach involves a deep-learning-based method for the automatic segmentation of tumor epithelial cells, tumor bulk, and stroma from whole-slide images (WSIs). Models were trained using five-fold cross-validation and evaluated on an independent external test set. TSR was computed based on the segmented components. Additionally, TSR's predictive value for six-month survival on the independent external dataset was assessed.
+         Results
+         Median Dice (inter-quartile range (IQR)) of 0.751(0.15) and 0.726(0.25) for tumor epithelium segmentation on internal and external test sets, respectively. Median Dice of 0.76(0.11) and 0.863(0.17) for tumor bulk segmentation on internal and external test sets, respectively. TSR was evaluated as an independent prognostic marker, demonstrating a cross-validation AUC of 0.61+-0.12 for predicting six-month survival on the external dataset.
+         Conclusion
+         Our pipeline for automatic TSR quantification offers promising potential as a prognostic marker for pancreatic cancer. The results underscore the feasibility of computational biomarker discovery in enhancing patient outcome prediction, thus contributing to personalized patient management.},
   all_ss_ids = {['3f15ff01bb8284b3cbb989d3188b9f21e1a87391']},
   automatic = {yes},
   citation-count = {0},
@@ -33076,32 +33109,32 @@ @conference{Venh17
   booktitle = ARVO,
   title = {Fully automated detection of hyperreflective foci in optical coherence tomography},
   abstract = {Purpose: Diabetic macular edema ({DME}) is a retinal disorder characterized by a buildup of cystoidal fluid in the retina.
-                                                       The typical treatment consists of monthly intravitreal anti vascular endothelial growth factor (anti-VEGF) injections.
-                                                       However, the efficacy of this treatment varies strongly.
-                                                       Recent studies have indicated that the presence and number of hyperreflective foci can possibly be considered a prognostic biomarker for treatment response in {DME}.
-                                                       As the detection of foci is difficult and time-consuming manual foci quantification seems infeasible.
-                                                       We therefore developed a fully automated system capable of detecting and quantifying foci in optical coherence tomography ({OCT}) images.
-                                                       Methods:
-                                                        119 fovea centered B-scans obtained from 49 patients with {DME} were selected from a clinical database.
-                                                       The data was divided in a training set of 96 {B}-scans from 40 patients, and a test set containing 23 {B}-scans from 9 patients.
-                                                       A convolutional neural network ({CNN}) was developed to predict if an image pixel belongs to a hyperreflective focus by considering a small neighborhood around the pixel of interest.
-                                                       The {CNN} consists of 7 convolutional layers and 2 max pooling layers.
-                                                       After providing the system with enough training samples, the network automatically detects pixels with a high probability of being part of a hyperreflective focus.
-                                                       Connected detections are considered as a single detection.
-                                                       The obtained results were compared to manual annotations made by two experienced human graders in consensus for the central 3 mm surrounding the fovea.
-                                                       Hyperreflective foci were only annotated in the layers ranging from the inner plexiform layer ({IPL}) to the outer nuclear layer ({ONL}) as manual detection is challenging in the other layers.
-                                                       When a detection is overlapping with an annotated focus it is considered a true positive, otherwise it is counted as a false positive.
-
-                                                       Results:
-
-                                                       In the independent test set a sensitivity of 0.83 was obtained.
-                                                       At this level of sensitivity, an average of 8.3 false positives per {B}-scan were detected.
-                                                       False positives were mainly caused by detections outside the selected range ({ILP} to {ONL}) and misdetections by the graders.
-
-                                                       Conclusions:
-
-                                                       An image analysis algorithm for the automatic detection and quantification of hyperreflective foci in {OCT} {B}-scans was developed.
-                                                       The experiments show promising results to obtain quantitative foci based biomarkers that can be used for the prediction of treatment response in {DME}.},
+                                                         The typical treatment consists of monthly intravitreal anti vascular endothelial growth factor (anti-VEGF) injections.
+                                                         However, the efficacy of this treatment varies strongly.
+                                                         Recent studies have indicated that the presence and number of hyperreflective foci can possibly be considered a prognostic biomarker for treatment response in {DME}.
+                                                         As the detection of foci is difficult and time-consuming, manual foci quantification seems infeasible.
+                                                         We therefore developed a fully automated system capable of detecting and quantifying foci in optical coherence tomography ({OCT}) images.
+                                                         Methods:
+                                                          119 fovea centered B-scans obtained from 49 patients with {DME} were selected from a clinical database.
+                                                         The data was divided in a training set of 96 {B}-scans from 40 patients, and a test set containing 23 {B}-scans from 9 patients.
+                                                         A convolutional neural network ({CNN}) was developed to predict if an image pixel belongs to a hyperreflective focus by considering a small neighborhood around the pixel of interest.
+                                                         The {CNN} consists of 7 convolutional layers and 2 max pooling layers.
+                                                         After providing the system with enough training samples, the network automatically detects pixels with a high probability of being part of a hyperreflective focus.
+                                                         Connected detections are considered as a single detection.
+                                                         The obtained results were compared to manual annotations made by two experienced human graders in consensus for the central 3 mm surrounding the fovea.
+                                                         Hyperreflective foci were only annotated in the layers ranging from the inner plexiform layer ({IPL}) to the outer nuclear layer ({ONL}) as manual detection is challenging in the other layers.
+                                                         When a detection is overlapping with an annotated focus it is considered a true positive, otherwise it is counted as a false positive.
+  
+                                                         Results:
+  
+                                                         In the independent test set a sensitivity of 0.83 was obtained.
+                                                         At this level of sensitivity, an average of 8.3 false positives per {B}-scan were detected.
+                                                         False positives were mainly caused by detections outside the selected range ({IPL} to {ONL}) and misdetections by the graders.
+  
+                                                         Conclusions:
+  
+                                                         An image analysis algorithm for the automatic detection and quantification of hyperreflective foci in {OCT} {B}-scans was developed.
+                                                         The experiments show promising results to obtain quantitative foci based biomarkers that can be used for the prediction of treatment response in {DME}.},
   optnote = {DIAG, RADIOLOGY},
   year = {2017},
 }
@@ -33116,18 +33149,18 @@ @article{Venh17a
   pages = {2318-2328},
   doi = {10.1167/iovs.16-20541},
   abstract = {Purpose: To evaluate a machine learning algorithm that automatically grades age-related macular degeneration (AMD) severity stages from optical coherence tomography (OCT) scans.
-                                                       Methods: A total of 3265 {OCT} scans from 1016 patients with either no signs of {AMD} or with signs of early, intermediate, or advanced {AMD} were randomly selected from a large European multicenter database.
-                                                       A machine learning system was developed to automatically grade unseen {OCT} scans into different {AMD} severity stages without requiring retinal layer segmentation.
-                                                       The ability of the system to identify high-risk {AMD} stages and to assign the correct severity stage was determined by using receiver operator characteristic ({ROC}) analysis and {C}ohen's Kappa statistics, respectively.
-                                                       The results were compared to those of two human observers.
-                                                       Reproducibility was assessed in an independent, publicly available data set of 384 {OCT} scans.
-                                                       Results:
-                                                        The system achieved an area under the {ROC} curve of 0.980 with a sensitivity of 98.2% at a specificity of 91.2%.
-                                                       This compares favorably with the performance of human observers who achieved sensitivities of 97.0% and 99.4% at specificities of 89.7% and 87.2%, respectively.
-                                                       A good level of agreement with the reference was obtained (Kappa = 0.713) and was in concordance with the human observers (Kappa = 0.775 and Kappa = 0.755, respectively).
-                                                       Conclusions:
-                                                        A machine learning system capable of automatically grading {OCT} scans into {AMD} severity stages was developed and showed similar performance as human observers.
-                                                       The proposed automatic system allows for a quick and reliable grading of large quantities of {OCT} scans, which could increase the efficiency of large-scale AMD studies and pave the way for {AMD} screening using {OCT}.},
+                                                         Methods: A total of 3265 {OCT} scans from 1016 patients with either no signs of {AMD} or with signs of early, intermediate, or advanced {AMD} were randomly selected from a large European multicenter database.
+                                                         A machine learning system was developed to automatically grade unseen {OCT} scans into different {AMD} severity stages without requiring retinal layer segmentation.
+                                                         The ability of the system to identify high-risk {AMD} stages and to assign the correct severity stage was determined by using receiver operator characteristic ({ROC}) analysis and {C}ohen's Kappa statistics, respectively.
+                                                         The results were compared to those of two human observers.
+                                                         Reproducibility was assessed in an independent, publicly available data set of 384 {OCT} scans.
+                                                         Results:
+                                                          The system achieved an area under the {ROC} curve of 0.980 with a sensitivity of 98.2% at a specificity of 91.2%.
+                                                         This compares favorably with the performance of human observers who achieved sensitivities of 97.0% and 99.4% at specificities of 89.7% and 87.2%, respectively.
+                                                         A good level of agreement with the reference was obtained (Kappa = 0.713) and was in concordance with the human observers (Kappa = 0.775 and Kappa = 0.755, respectively).
+                                                         Conclusions:
+                                                          A machine learning system capable of automatically grading {OCT} scans into {AMD} severity stages was developed and showed similar performance as human observers.
+                                                         The proposed automatic system allows for a quick and reliable grading of large quantities of {OCT} scans, which could increase the efficiency of large-scale AMD studies and pave the way for {AMD} screening using {OCT}.},
   file = {Venh17a.pdf:pdf\\Venh17a.pdf:PDF},
   optnote = {DIAG},
   pmid = {28437528},
@@ -33148,10 +33181,10 @@ @article{Venh17b
   pages = {3292-3316},
   doi = {10.1364/BOE.8.003292},
   abstract = {We developed a fully automated system using a convolutional neural network ({CNN}) for total retina segmentation in optical coherence tomography ({OCT}) that is robust to the presence of severe retinal pathology.
-                                                       A generalized U-net network architecture was introduced to include the large context needed to account for large retinal changes.
-                                                       The proposed algorithm outperformed qualitative and quantitatively two available algorithms.
-                                                       The algorithm accurately estimated macular thickness with an error of 14.0 +- 22.1 micrometer, substantially lower than the error obtained using the other algorithms (42.9 +- 22.1 micrometer and 27.1 +- 69.3 micrometer, respectively).
-                                                       These results highlighted the proposed algorithm's capability of modeling the wide variability in retinal appearance and obtained a robust and reliable retina segmentation even in severe pathological cases.},
+                                                         A generalized U-net network architecture was introduced to include the large context needed to account for large retinal changes.
+                                                         The proposed algorithm outperformed two available algorithms both qualitatively and quantitatively.
+                                                         The algorithm accurately estimated macular thickness with an error of 14.0 +- 22.1 micrometer, substantially lower than the error obtained using the other algorithms (42.9 +- 22.1 micrometer and 27.1 +- 69.3 micrometer, respectively).
+                                                         These results highlighted the proposed algorithm's capability of modeling the wide variability in retinal appearance and obtained a robust and reliable retina segmentation even in severe pathological cases.},
   file = {Venh17b.pdf:pdf\\Venh17b.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {28717568},
@@ -33247,10 +33280,10 @@ @conference{Venk21a
   booktitle = RSNA,
   title = {Integration Of A Deep Learning Algorithm Into The Clinically Established PanCan Model For Malignancy Risk Estimation Of Screen-detected Pulmonary Nodules In First Screening CT},
   abstract = {PURPOSE: To quantify the added value of integrating a deep learning algorithm (DLA)'s output to the existing Pan-Canadian Early Detection of Lung Cancer Study (PanCan) models for estimating malignancy risk of screen-detected pulmonary nodules.
-                             METHODS AND MATERIALS: Our DLA was trained on a cohort of 14,828 benign and 1,249 malignant nodules from the National Lung Screening Trial. In the present study, we derived a new multivariable logistic regression model on the PanCan data that included the DLA risk score and the original variables from the PanCan model 2b except for "nodule type" and "spiculation" as these are already encoded in the DLA risk score. The new model was externally validated on baseline nodules from the Danish Lung Cancer Screening Trial (DLCST). For comparison, the performances of the existing PanCan model 2b and of our DLA stand-alone were also calculated.
-                             RESULTS: 6024 benign and 86 malignant nodules from the PanCan data were included as the development set, and 818 benign and 34 malignant nodules from the Danish Lung Cancer Screening Trial (DLCST) were included as the validation set. The area under the receiver operating characteristic curve (AUC) for the DLA, PanCan model 2b, and the new model in the PanCan cohort were 0.944 (95% confidence interval = 0.917 - 0.968), 0.941 (0.908 - 0.969), and 0.944 (0.909 - 0.975), respectively. In the DLCST cohort, the AUCs were 0.917 (0.851 - 0.968), 0.896 (0.841 - 0.944), and 0.927 (0.878 - 0.969), respectively.
-                             CONCLUSIONS: Using our DLA risk score to derive a new multivariable logistic regression model on the PanCan data does not appear to significantly improve the predictive performance in high-risk screening participants, but may serve as a replacement for the "nodule type" and "spiculation" parameters that are known to have substantial interobserver variability.
-                             CLINICAL RELEVANCE / APPLICATION: Our DLA has a comparable nodule malignancy risk estimation performance to the PanCan models. This may help to make the computation of nodule risk scores easier and less subjective.},
+                               METHODS AND MATERIALS: Our DLA was trained on a cohort of 14,828 benign and 1,249 malignant nodules from the National Lung Screening Trial. In the present study, we derived a new multivariable logistic regression model on the PanCan data that included the DLA risk score and the original variables from the PanCan model 2b except for "nodule type" and "spiculation" as these are already encoded in the DLA risk score. The new model was externally validated on baseline nodules from the Danish Lung Cancer Screening Trial (DLCST). For comparison, the performances of the existing PanCan model 2b and of our DLA stand-alone were also calculated.
+                               RESULTS: 6024 benign and 86 malignant nodules from the PanCan data were included as the development set, and 818 benign and 34 malignant nodules from the Danish Lung Cancer Screening Trial (DLCST) were included as the validation set. The area under the receiver operating characteristic curve (AUC) for the DLA, PanCan model 2b, and the new model in the PanCan cohort were 0.944 (95% confidence interval = 0.917 - 0.968), 0.941 (0.908 - 0.969), and 0.944 (0.909 - 0.975), respectively. In the DLCST cohort, the AUCs were 0.917 (0.851 - 0.968), 0.896 (0.841 - 0.944), and 0.927 (0.878 - 0.969), respectively.
+                               CONCLUSIONS: Using our DLA risk score to derive a new multivariable logistic regression model on the PanCan data does not appear to significantly improve the predictive performance in high-risk screening participants, but may serve as a replacement for the "nodule type" and "spiculation" parameters that are known to have substantial interobserver variability.
+                               CLINICAL RELEVANCE / APPLICATION: Our DLA has a comparable nodule malignancy risk estimation performance to the PanCan models. This may help to make the computation of nodule risk scores easier and less subjective.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -33260,18 +33293,18 @@ @conference{Venk22
   booktitle = ECR,
   title = {Deep learning for estimating pulmonary nodule malignancy risk using prior CT examinations in lung cancer screening},
   abstract = {Purpose or Learning Objective: Nodule size, morphology, and growth are important factors for accurately estimating nodule malignancy risk in lung cancer screening CT examinations. In this work, we aimed to develop a deep learning (DL) algorithm that uses a current and a prior CT examination to estimate the malignancy risk of pulmonary nodules.
-
-                             Methods or Background: We developed a dual time-point DL algorithm by stacking the nodules from the current and prior CT examinations in the input channels of convolutional neural networks. We used 3,011 nodules (286 malignant) and 994 nodules (73 malignant) as development and hold-out test cohorts from the National Lung Screening Trial, respectively. The reference standard was set by histopathologic confirmation or CT follow-up of more than two years. We compared the performance of the algorithm against PanCan model 2b and a previously published single time-point DL algorithm that only processed a single CT examination. We used the area under the receiver operating characteristic curve (AUC) to measure discrimination performance and a standard permutation test with 10,000 random permutations to compute p-values.
-
-                             Results or Findings: The dual time-point DL algorithm achieved an AUC of 0.94 (95% CI: 0.91 - 0.97) on the hold-out test cohort. The algorithm outperformed the single time-point DL algorithm and the PanCan model, which had AUCs of 0.92 (95% CI: 0.89 - 0.95; p = 0.055) and 0.88 (95% CI: 0.85 - 0.91; p < 0.001), respectively.
-
-                             Conclusion: Deep learning algorithms using current and prior CT examinations have the potential to accurately estimate the malignancy risk of pulmonary nodules.
-
-                             Limitations: External validation is needed on other screening datasets to generate further evidence.
-
-                             Ethics committee approval: Institutional review board approval was obtained at each of the 33 centers involved in the NLST.
-
-                             Funding for this study: Research grant from MeVis Medical Solutions AG.},
+  
+                               Methods or Background: We developed a dual time-point DL algorithm by stacking the nodules from the current and prior CT examinations in the input channels of convolutional neural networks. We used 3,011 nodules (286 malignant) and 994 nodules (73 malignant) as development and hold-out test cohorts from the National Lung Screening Trial, respectively. The reference standard was set by histopathologic confirmation or CT follow-up of more than two years. We compared the performance of the algorithm against PanCan model 2b and a previously published single time-point DL algorithm that only processed a single CT examination. We used the area under the receiver operating characteristic curve (AUC) to measure discrimination performance and a standard permutation test with 10,000 random permutations to compute p-values.
+  
+                               Results or Findings: The dual time-point DL algorithm achieved an AUC of 0.94 (95% CI: 0.91 - 0.97) on the hold-out test cohort. The algorithm outperformed the single time-point DL algorithm and the PanCan model, which had AUCs of 0.92 (95% CI: 0.89 - 0.95; p = 0.055) and 0.88 (95% CI: 0.85 - 0.91; p < 0.001), respectively.
+  
+                               Conclusion: Deep learning algorithms using current and prior CT examinations have the potential to accurately estimate the malignancy risk of pulmonary nodules.
+  
+                               Limitations: External validation is needed on other screening datasets to generate further evidence.
+  
+                               Ethics committee approval: Institutional review board approval was obtained at each of the 33 centers involved in the NLST.
+  
+                               Funding for this study: Research grant from MeVis Medical Solutions AG.},
   optnote = {DIAG, RADIOLOGY},
   year = {2022},
 }
@@ -33285,19 +33318,19 @@ @article{Venk23
   number = {2},
   algorithm = {https://grand-challenge.org/algorithms/temporal-nodule-analysis/},
   abstract = {Background
-                          Prior chest CT provides valuable temporal information (eg, changes in nodule size or appearance) to accurately estimate malignancy risk.
-
-                          Purpose
-                          To develop a deep learning (DL) algorithm that uses a current and prior low-dose CT examination to estimate 3-year malignancy risk of pulmonary nodules.
-
-                          Materials and Methods
-                          In this retrospective study, the algorithm was trained using National Lung Screening Trial data (collected from 2002 to 2004), wherein patients were imaged at most 2 years apart, and evaluated with two external test sets from the Danish Lung Cancer Screening Trial (DLCST) and the Multicentric Italian Lung Detection Trial (MILD), collected in 2004-2010 and 2005-2014, respectively. Performance was evaluated using area under the receiver operating characteristic curve (AUC) on cancer-enriched subsets with size-matched benign nodules imaged 1 and 2 years apart from DLCST and MILD, respectively. The algorithm was compared with a validated DL algorithm that only processed a single CT examination and the Pan-Canadian Early Lung Cancer Detection Study (PanCan) model.
-
-                          Results
-                          The training set included 10 508 nodules (422 malignant) in 4902 trial participants (mean age, 64 years +- 5 [SD]; 2778 men). The size-matched external test sets included 129 nodules (43 malignant) and 126 nodules (42 malignant). The algorithm achieved AUCs of 0.91 (95% CI: 0.85, 0.97) and 0.94 (95% CI: 0.89, 0.98). It significantly outperformed the DL algorithm that only processed a single CT examination (AUC, 0.85 [95% CI: 0.78, 0.92; P = .002]; and AUC, 0.89 [95% CI: 0.84, 0.95; P = .01]) and the PanCan model (AUC, 0.64 [95% CI: 0.53, 0.74; P < .001]; and AUC, 0.63 [95% CI: 0.52, 0.74; P < .001]).
-
-                          Conclusion
-                          A DL algorithm using current and prior low-dose CT examinations was more effective at estimating 3-year malignancy risk of pulmonary nodules than established models that only use a single CT examination.},
+                            Prior chest CT provides valuable temporal information (eg, changes in nodule size or appearance) to accurately estimate malignancy risk.
+  
+                            Purpose
+                            To develop a deep learning (DL) algorithm that uses a current and prior low-dose CT examination to estimate 3-year malignancy risk of pulmonary nodules.
+  
+                            Materials and Methods
+                            In this retrospective study, the algorithm was trained using National Lung Screening Trial data (collected from 2002 to 2004), wherein patients were imaged at most 2 years apart, and evaluated with two external test sets from the Danish Lung Cancer Screening Trial (DLCST) and the Multicentric Italian Lung Detection Trial (MILD), collected in 2004-2010 and 2005-2014, respectively. Performance was evaluated using area under the receiver operating characteristic curve (AUC) on cancer-enriched subsets with size-matched benign nodules imaged 1 and 2 years apart from DLCST and MILD, respectively. The algorithm was compared with a validated DL algorithm that only processed a single CT examination and the Pan-Canadian Early Lung Cancer Detection Study (PanCan) model.
+  
+                            Results
+                            The training set included 10 508 nodules (422 malignant) in 4902 trial participants (mean age, 64 years +- 5 [SD]; 2778 men). The size-matched external test sets included 129 nodules (43 malignant) and 126 nodules (42 malignant). The algorithm achieved AUCs of 0.91 (95% CI: 0.85, 0.97) and 0.94 (95% CI: 0.89, 0.98). It significantly outperformed the DL algorithm that only processed a single CT examination (AUC, 0.85 [95% CI: 0.78, 0.92; P = .002]; and AUC, 0.89 [95% CI: 0.84, 0.95; P = .01]) and the PanCan model (AUC, 0.64 [95% CI: 0.53, 0.74; P < .001]; and AUC, 0.63 [95% CI: 0.52, 0.74; P < .001]).
+  
+                            Conclusion
+                            A DL algorithm using current and prior low-dose CT examinations was more effective at estimating 3-year malignancy risk of pulmonary nodules than established models that only use a single CT examination.},
   citation-count = {0},
   file = {Venk23.pdf:pdf\Venk23.pdf:PDF},
   journal = {Radiology},
@@ -33329,13 +33362,13 @@ @conference{Vent20
   booktitle = ARVO,
   title = {Estimating Uncertainty of Deep Neural Networks for Age-related Macular Degeneration Grading using Optical Coherence Tomography},
   abstract = {Purpose: Deep convolutional neural networks (CNNs) are increasingly being used for eye disease screening and diagnosis. Especially the best performing variants, however, are generally overconfident in their predictions. For usefulness in clinical practice and increasing clinicians' trust on the estimated diagnosis, well-calibrated uncertainty estimates are necessary. We present a method for providing confidence scores of CNNs for age-related macular degeneration (AMD) grading in optical coherence tomography (OCT).
-
-
-                                                       Methods: 1,264 OCT volumes from 633 patients from the European Genetic Database (EUGENDA) were graded as one of five stages of AMD (No AMD, Early AMD, Intermediate AMD, Advanced AMD: GA, and Advanced AMD: CNV). Ten different 3D DenseNet-121 models that take a full OCT volume as input were used to predict the corresponding AMD stage. These networks were all trained on the same dataset. However, each of these networks were initialized differently. The class with the maximum average softmax output of these models was used as the final prediction. The confidence measure was the normalized average softmax output for that class.
-
-                                                       Results: The algorithm achieved an area under the Receiver Operating Characteristic of 0.9785 and a quadratic-weighted kappa score of 0.8935. The mean uncertainty, calculated as 1 - the mean confidence score, for incorrect predictions was 1.9 times as high as the mean uncertainty for correct predictions. When only using the probability output of a single network, this ratio was 1.4. Another measure for uncertainty estimation performance is the Expected Calibration Error (ECE), where a lower value is better. When comparing the method to the probability output of a single network, the ECE improved from 0.0971 to 0.0324. Figure 1 shows examples of both confident and unconfident predictions.
-
-                                                       Conclusions: We present a method for improving uncertainty estimation for AMD grading in OCT, by combining the output of multiple individually trained CNNs. This increased reliability of system confidences can contribute to building trust in CNNs for retinal disease screening. Furthermore, this technique is a first step towards selective prediction in retinal disease screening, where only cases with high uncertainty predictions need to be referred for expert evaluation.},
+  
+  
+                                                         Methods: 1,264 OCT volumes from 633 patients from the European Genetic Database (EUGENDA) were graded as one of five stages of AMD (No AMD, Early AMD, Intermediate AMD, Advanced AMD: GA, and Advanced AMD: CNV). Ten different 3D DenseNet-121 models that take a full OCT volume as input were used to predict the corresponding AMD stage. These networks were all trained on the same dataset. However, each of these networks were initialized differently. The class with the maximum average softmax output of these models was used as the final prediction. The confidence measure was the normalized average softmax output for that class.
+  
+                                                         Results: The algorithm achieved an area under the Receiver Operating Characteristic of 0.9785 and a quadratic-weighted kappa score of 0.8935. The mean uncertainty, calculated as 1 - the mean confidence score, for incorrect predictions was 1.9 times as high as the mean uncertainty for correct predictions. When only using the probability output of a single network, this ratio was 1.4. Another measure for uncertainty estimation performance is the Expected Calibration Error (ECE), where a lower value is better. When comparing the method to the probability output of a single network, the ECE improved from 0.0971 to 0.0324. Figure 1 shows examples of both confident and unconfident predictions.
+  
+                                                         Conclusions: We present a method for improving uncertainty estimation for AMD grading in OCT, by combining the output of multiple individually trained CNNs. This increased reliability of system confidences can contribute to building trust in CNNs for retinal disease screening. Furthermore, this technique is a first step towards selective prediction in retinal disease screening, where only cases with high uncertainty predictions need to be referred for expert evaluation.},
   optnote = {DIAG, RADIOLOGY},
   year = {2020},
   month = {6},
@@ -33367,12 +33400,12 @@ @conference{Vent21a
   url = {https://iovs.arvojournals.org/article.aspx?articleid=2775505},
   title = {Making AI Transferable Across OCT Scanners from Different Vendors},
   abstract = {Purpose: Deep neural networks (DNNs) for optical coherence tomography (OCT) classification have been proven to work well on images from scanners that were used during training. However, since the appearance of OCT scans can differ greatly between vendors, these DNNs often fail when they are applied to scans from different manufacturers. We propose a DNN architecture for age-related macular degeneration (AMD) grading that maintains performance on OCTs from vendors not included during training.
-
-                               Methods: 2,598 and 680 Heidelberg Spectralis OCT scans from the European Genetic Database were used for development and testing, respectively. We tested transferability with 339 AMD-enriched Topcon OCTs from the Rotterdam Study. AMD severity classification was determined manually in accordance with the Cologne Image Reading Center and Laboratory and Rotterdam Classification, respectively. Classifications were harmonized for the evaluation of the DNNs. The proposed DNN considers each B-scan separately using a 2D ResNet-18, and internally combines the intermediate outputs related to each B-scan using a multiple instance learning approach. Even though the proposed DNN provides both B-scan level and OCT-volume level decisions, the architecture is trained end-to-end using only full volume gradings. This specific architecture makes our method robust to the variability of scanning protocols across vendors, as it is invariant to B-scan spacing. We compare this approach to a baseline that classifies the full OCT scan directly using a 3D ResNet-18.
-
-                               Results: The quadratic weighted kappa (QWK) for the baseline method dropped from 0.852 on the Heidelberg Spectralis dataset to 0.523 on the Topcon dataset. This QWK drop was smaller (p = 0.001) for our approach, which dropped from 0.849 to 0.717. The difference in area under the Receiver Operating Characteristic (AUC) drop was also smaller (p < 0.001) for our approach (0.969 to 0.906, -6.5%) than for the baseline method (0.971 to 0.806, -17.0%).
-
-                               Conclusions: We present a DNN for AMD classification on OCT scans that transfers well to scans from vendors that were not used for development. This alleviates the need for retraining on data from these scanner types, which is an expensive process in terms of data acquisition, model development, and human annotation time. Furthermore, this increases the applicability of AI for OCT classification in broader scopes than the settings in which they were developed.},
+  
+                                 Methods: 2,598 and 680 Heidelberg Spectralis OCT scans from the European Genetic Database were used for development and testing, respectively. We tested transferability with 339 AMD-enriched Topcon OCTs from the Rotterdam Study. AMD severity classification was determined manually in accordance with the Cologne Image Reading Center and Laboratory and Rotterdam Classification, respectively. Classifications were harmonized for the evaluation of the DNNs. The proposed DNN considers each B-scan separately using a 2D ResNet-18, and internally combines the intermediate outputs related to each B-scan using a multiple instance learning approach. Even though the proposed DNN provides both B-scan level and OCT-volume level decisions, the architecture is trained end-to-end using only full volume gradings. This specific architecture makes our method robust to the variability of scanning protocols across vendors, as it is invariant to B-scan spacing. We compare this approach to a baseline that classifies the full OCT scan directly using a 3D ResNet-18.
+  
+                                 Results: The quadratic weighted kappa (QWK) for the baseline method dropped from 0.852 on the Heidelberg Spectralis dataset to 0.523 on the Topcon dataset. This QWK drop was smaller (p = 0.001) for our approach, which dropped from 0.849 to 0.717. The difference in area under the Receiver Operating Characteristic (AUC) drop was also smaller (p < 0.001) for our approach (0.969 to 0.906, -6.5%) than for the baseline method (0.971 to 0.806, -17.0%).
+  
+                                 Conclusions: We present a DNN for AMD classification on OCT scans that transfers well to scans from vendors that were not used for development. This alleviates the need for retraining on data from these scanner types, which is an expensive process in terms of data acquisition, model development, and human annotation time. Furthermore, this increases the applicability of AI for OCT classification in broader scopes than the settings in which they were developed.},
   optnote = {DIAG, RADIOLOGY},
   year = {2021},
 }
@@ -33395,31 +33428,34 @@ @article{Vent23
   gscites = {11},
 }
 
-@article{Vent23a,
-  author = {de Vente, Coen and van Ginneken, Bram and Hoyng, Carel B. and Klaver, Caroline C. W. and S\'{a}nchez, Clara I.},
-  title = {Uncertainty-Aware Multiple-Instance Learning for Reliable Classification: Application to Optical Coherence Tomography},
-  doi = {10.48550/ARXIV.2302.03116},
-  year = {2023},
-  abstract = {Deep learning classification models for medical image analysis often perform well on data from scanners that were used during training. However, when these models are applied to data from different vendors, their performance tends to drop substantially. Artifacts that only occur within scans from specific scanners are major causes of this poor generalizability. We aimed to improve the reliability of deep learning classification models by proposing Uncertainty-Based Instance eXclusion (UBIX). This technique, based on multiple-instance learning, reduces the effect of corrupted instances on the bag-classification by seamlessly integrating out-of-distribution (OOD) instance detection during inference. Although UBIX is generally applicable to different medical images and diverse classification tasks, we focused on staging of age-related macular degeneration in optical coherence tomography. After being trained using images from one vendor, UBIX showed a reliable behavior, with a slight decrease in performance (a decrease of the quadratic weighted kappa ($\kappa$\textsubscript{w}) from 0.861 to 0.708), when applied to images from different vendors containing artifacts; while a state-of-the-art 3D neural network suffered from a significant detriment of performance ($\kappa$\textsubscript{w} from 0.852 to 0.084) on the same test set. We showed that instances with unseen artifacts can be identified with OOD detection and their contribution to the bag-level predictions can be reduced, improving reliability without the need for retraining on new data. This potentially increases the applicability of artificial intelligence models to data from other scanners than the ones for which they were developed.},
-  url = {https://arxiv.org/abs/2302.03116},
-  file = {Vent23a.pdf:pdf\\Vent23a.pdf:PDF},
+@article{Vent24,
+  author = {de Vente, Coen and van Ginneken, Bram and Hoyng, Carel B. and Klaver, Caroline C. W. and S\'{a}nchez, Clara I.},
+  title = {Uncertainty-Aware Multiple-Instance Learning for Reliable Classification: Application to Optical Coherence Tomography},
+  doi = {10.1016/j.media.2024.103259},
+  year = {2024},
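+  abstract = {Deep learning classification models for medical image analysis often perform well on data from scanners that were used during training. However, when these models are applied to data from different vendors, their performance tends to drop substantially. Artifacts that only occur within scans from specific scanners are major causes of this poor generalizability. We aimed to improve the reliability of deep learning classification models by proposing Uncertainty-Based Instance eXclusion (UBIX). This technique, based on multiple-instance learning, reduces the effect of corrupted instances on the bag-classification by seamlessly integrating out-of-distribution (OOD) instance detection during inference. Although UBIX is generally applicable to different medical images and diverse classification tasks, we focused on staging of age-related macular degeneration in optical coherence tomography. After being trained using images from one vendor, UBIX showed a reliable behavior, with a slight decrease in performance (a decrease of the quadratic weighted kappa ($\kappa$\textsubscript{w}) from 0.861 to 0.708), when applied to images from different vendors containing artifacts; while a state-of-the-art 3D neural network suffered from a significant detriment of performance ($\kappa$\textsubscript{w} from 0.852 to 0.084) on the same test set. We showed that instances with unseen artifacts can be identified with OOD detection and their contribution to the bag-level predictions can be reduced, improving reliability without the need for retraining on new data. This potentially increases the applicability of artificial intelligence models to data from other scanners than the ones for which they were developed.},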
+  url = {http://dx.doi.org/10.1016/j.media.2024.103259},
+  file = {Vent24.pdf:pdf\\Vent24.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
-  journal = {arXiv:2302.03116},
+  journal = {Medical Image Analysis},
   automatic = {yes},
-  all_ss_ids = {[362c510dec0d566d22d5be3af0519fc7eec8bb86]},
-  gscites = {1},
+  all_ss_ids = {['362c510dec0d566d22d5be3af0519fc7eec8bb86']},
+  citation-count = {0},
+  pages = {103259},
+  volume = {97},
+  pmid = {38959721},
 }
 
 @mastersthesis{Verb21,
   author = {Jeroen Verboom},
   title = {Deep Learning for Fracture Detection in the Radius and Ulna on Conventional Radiographs},
   abstract = {This work gives a compartmentalized overview of a fracture detection tool to detect and localize fractures in the radius and ulna on conventional radiographs using deep learning.
-                              This contrasts earlier studies in that it proposes a more efficient object detector, demonstrates the generalizability of fracture detection models to data from a different hospital, and employs more advanced class activation mapping methods for fracture localization.
-                              Both RadboudUMC and the Jeroen Bosch Ziekenhuis provided data to create a multi-institutional dataset.
-                              The two data sources enabled me to demonstrate how fracture detection classifiers trained on data from only one institution significantly perform less when tested on data from another institution.
-                              Moreover, this study demonstrates a more efficient bone localization method that yields adequate performance to be used for cropping regions of interest, and a newer fracture localization method (ScoreCAM) that outperforms its predecessors in terms of highlighting less redundant information.
-                              I conclude that the algorithms presented in this work show the potential to be incorporated in a clinically usable fracture detection tool.
-                              However, more research needs to be conducted using multi-institutional for training fracture detection classifiers.},
+                                This contrasts earlier studies in that it proposes a more efficient object detector, demonstrates the generalizability of fracture detection models to data from a different hospital, and employs more advanced class activation mapping methods for fracture localization.
+                                Both RadboudUMC and the Jeroen Bosch Ziekenhuis provided data to create a multi-institutional dataset.
+                                The two data sources enabled me to demonstrate how fracture detection classifiers trained on data from only one institution significantly perform less when tested on data from another institution.
+                                Moreover, this study demonstrates a more efficient bone localization method that yields adequate performance to be used for cropping regions of interest, and a newer fracture localization method (ScoreCAM) that outperforms its predecessors in terms of highlighting less redundant information.
+                                I conclude that the algorithms presented in this work show the potential to be incorporated in a clinically usable fracture detection tool.
+                                However, more research needs to be conducted using multi-institutional data for training fracture detection classifiers.},
   file = {Verb21.pdf:pdf/Verb21.pdf:PDF},
   optnote = {DIAG},
   school = {Tilburg University},
@@ -33559,20 +33595,20 @@ @article{Vina23
   url = {http://dx.doi.org/10.1186/s12903-023-03362-8},
   volume = {23},
   abstract = {Abstract
-                                          Objective
-                                          Intra-oral scans and gypsum cast scans (OS) are widely used in orthodontics, prosthetics, implantology, and orthognathic surgery to plan patient-specific treatments, which require teeth segmentations with high accuracy and resolution. Manual teeth segmentation, the gold standard up until now, is time-consuming, tedious, and observer-dependent. This study aims to develop an automated teeth segmentation and labeling system using deep learning.
-
-                                          Material and methods
-                                          As a reference, 1750 OS were manually segmented and labeled. A deep-learning approach based on PointCNN and 3D U-net in combination with a rule-based heuristic algorithm and a combinatorial search algorithm was trained and validated on 1400 OS. Subsequently, the trained algorithm was applied to a test set consisting of 350 OS. The intersection over union (IoU), as a measure of accuracy, was calculated to quantify the degree of similarity between the annotated ground truth and the model predictions.
-
-                                          Results
-                                          The model achieved accurate teeth segmentations with a mean IoU score of 0.915. The FDI labels of the teeth were predicted with a mean accuracy of 0.894. The optical inspection showed excellent position agreements between the automatically and manually segmented teeth components. Minor flaws were mostly seen at the edges.
-
-                                          Conclusion
-                                          The proposed method forms a promising foundation for time-effective and observer-independent teeth segmentation and labeling on intra-oral scans.
-
-                                          Clinical significance
-                                          Deep learning may assist clinicians in virtual treatment planning in orthodontics, prosthetics, implantology, and orthognathic surgery. The impact of using such models in clinical practice should be explored.},
+                                            Objective
+                                            Intra-oral scans and gypsum cast scans (OS) are widely used in orthodontics, prosthetics, implantology, and orthognathic surgery to plan patient-specific treatments, which require teeth segmentations with high accuracy and resolution. Manual teeth segmentation, the gold standard up until now, is time-consuming, tedious, and observer-dependent. This study aims to develop an automated teeth segmentation and labeling system using deep learning.
+  
+                                            Material and methods
+                                            As a reference, 1750 OS were manually segmented and labeled. A deep-learning approach based on PointCNN and 3D U-net in combination with a rule-based heuristic algorithm and a combinatorial search algorithm was trained and validated on 1400 OS. Subsequently, the trained algorithm was applied to a test set consisting of 350 OS. The intersection over union (IoU), as a measure of accuracy, was calculated to quantify the degree of similarity between the annotated ground truth and the model predictions.
+  
+                                            Results
+                                            The model achieved accurate teeth segmentations with a mean IoU score of 0.915. The FDI labels of the teeth were predicted with a mean accuracy of 0.894. The optical inspection showed excellent position agreements between the automatically and manually segmented teeth components. Minor flaws were mostly seen at the edges.
+  
+                                            Conclusion
+                                            The proposed method forms a promising foundation for time-effective and observer-independent teeth segmentation and labeling on intra-oral scans.
+  
+                                            Clinical significance
+                                            Deep learning may assist clinicians in virtual treatment planning in orthodontics, prosthetics, implantology, and orthognathic surgery. The impact of using such models in clinical practice should be explored.},
   citation-count = {0},
   file = {Vina23.pdf:pdf\Vina23.pdf:PDF},
   journal = {BMC Oral Health},
@@ -33937,19 +33973,19 @@ @conference{Vree15a
   booktitle = RSNA,
   year = {2015},
   abstract = {PURPOSE
-                                                       The purpose of this study was to evaluate the visibility of MR screen detected cancers on prior MR examinations in a population with an elevated risk for breast cancer.
-
-                                                       METHOD AND MATERIALS
-                                                       An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was conducted at our academic center. We selected all cases detected in MRI with a prior negative MR examination performed between 6 and 24 months before a cancer was revealed (mean: 12.8 A-A?A 1/2  3.7 months). This yielded 43 cancers (3 invasive lobular-, 33 invasive ductal carcinomas, 5 ductal carcinoma in situ and 2 others) in 41 patients (age: 49 A-A?A 1/2  9.8 years, 21 BRCA patients). The MR scans where the cancers were detected (diagnostic MR scan) and the prior MR scans were evaluated side-by-side in consensus by two dedicated breast radiologists. The visibility of the cancers on prior scans was rated as: visible (BIRADS 4/5), minimal sign (BIRADS 2/3), or invisible (BIRADS 1). Chi-square tests were used to test the correlation between patient and cancer characteristics, image quality (IQ), background parenchymal enhancement (BPE), and visibility of the tumor in the prior MR scan.
-
-                                                       RESULTS
-                                                       All lesions were retrospectively evident on the diagnostic MR scan. Review of the prior examinations of the 43 cancers detected in follow-up rounds revealed that 11 lesions (26%) were visible in the prior MRI and should have been recalled at the time of this scan. 15 lesions (35%) showed a minimal sign in the prior MRI. Only 17 lesions (40%) were completely invisible. High grade, ER negative, and PR negative tumors were more often invisible in the prior scan (p=0.016, p=0.005, and p=0.002). Moreover, tumors in BRCA patients were more likely to be invisible in the prior scan, than in non-BRCA carriers (p=0.025). IQ and BPE were not significantly related to the visibility of tumors in the prior scan.
-
-                                                       CONCLUSION
-                                                       About 26% of the breast cancers could have been recalled earlier and only 40% of the breast cancers were invisible in retrospect.
-
-                                                       CLINICAL RELEVANCE/APPLICATION
-                                                       To prevent screening errors regular auditing of clinical practice is indicated. Moreover, like in mammography, structural double reading of MRI screening examinations may be recommended.},
+                                                         The purpose of this study was to evaluate the visibility of MR screen detected cancers on prior MR examinations in a population with an elevated risk for breast cancer.
+  
+                                                         METHOD AND MATERIALS
+                                                         An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was conducted at our academic center. We selected all cases detected in MRI with a prior negative MR examination performed between 6 and 24 months before a cancer was revealed (mean: 12.8 +- 3.7 months). This yielded 43 cancers (3 invasive lobular-, 33 invasive ductal carcinomas, 5 ductal carcinoma in situ and 2 others) in 41 patients (age: 49 +- 9.8 years, 21 BRCA patients). The MR scans where the cancers were detected (diagnostic MR scan) and the prior MR scans were evaluated side-by-side in consensus by two dedicated breast radiologists. The visibility of the cancers on prior scans was rated as: visible (BIRADS 4/5), minimal sign (BIRADS 2/3), or invisible (BIRADS 1). Chi-square tests were used to test the correlation between patient and cancer characteristics, image quality (IQ), background parenchymal enhancement (BPE), and visibility of the tumor in the prior MR scan.
+  
+                                                         RESULTS
+                                                         All lesions were retrospectively evident on the diagnostic MR scan. Review of the prior examinations of the 43 cancers detected in follow-up rounds revealed that 11 lesions (26%) were visible in the prior MRI and should have been recalled at the time of this scan. 15 lesions (35%) showed a minimal sign in the prior MRI. Only 17 lesions (40%) were completely invisible. High grade, ER negative, and PR negative tumors were more often invisible in the prior scan (p=0.016, p=0.005, and p=0.002). Moreover, tumors in BRCA patients were more likely to be invisible in the prior scan, than in non-BRCA carriers (p=0.025). IQ and BPE were not significantly related to the visibility of tumors in the prior scan.
+  
+                                                         CONCLUSION
+                                                         About 26% of the breast cancers could have been recalled earlier and only 40% of the breast cancers were invisible in retrospect.
+  
+                                                         CLINICAL RELEVANCE/APPLICATION
+                                                         To prevent screening errors regular auditing of clinical practice is indicated. Moreover, like in mammography, structural double reading of MRI screening examinations may be recommended.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -33959,19 +33995,19 @@ @conference{Vree15b
   booktitle = RSNA,
   year = {2015},
   abstract = {PURPOSE
-                                                       Breast cancer screening in women at elevated risk is performed with yearly MRI and mammography. This includes women with BRCA mutations and women at elevated risk for other causes (mainly family history). The purpose of this study was to assess differences between BRCA mutation carriers and non-BRCA patients in a longitudinal MRI screening program in terms of recall rate, positive predictive value, and detection.
-
-                                                       METHOD AND MATERIALS
-                                                       An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was performed at our academic center. We analysed 9.504 screening MR examinations in 2843 women (age: 45 A-A?A 1/2  12.09 years), including 761 BRCA patients, and 2082 non-BRCA patients. Recall rate (RR), positive predictive value (PPV), and cancer detection rate (CDR) were evaluated for first round examinations and follow-up examinations separately. BRCA patients were compared with non-BRCA patients. Chi-square tests were used to determine statistical significance.
-
-                                                       RESULTS
-                                                       The RR for BRCA patients in the first round of screening was 86.07 per 1000 examinations and 52.58 per 1000 examinations in non-BRCA patients (p<0.001). The PPV for BRCA patients in the first round of screening was found to be 0.44, compared to 0.50 in non-BRCA patients (p=0.013). The CDR was 38.25 per 1000 examinations for BRCA patients and 26.53 per 1000 examinations for non-BRCA patients (p<0.001). In follow up, the RR was found to be 24.92 per 1000 examinations for BRCA patients and 22.81 per 1000 examinations for non-BRCA patients (p<0.001). The PPV was 0.46 for BRCA patients and 0.21 for non-BRCA patients (p<0.001). CDR was 11.42 per 1000 examinations for BRCA patients and 4.86 per 1000 examinations for non-BRCA patients (p<0.001).
-
-                                                       CONCLUSION
-                                                       RR and CDR are high for all patients in the first round. RR and CDR significantly decreased in follow-up rounds (p<0.001). PPV remained at an acceptable level for both patient groups, and remains particularly high in BRCA carriers. RR, PPV, and CDR differed significantly between BRCA and non-BRCA patients in both first and follow up rounds.
-
-                                                       CLINICAL RELEVANCE/APPLICATION
-                                                       These results underline that MRI is an excellent tool for screening high risk patients. Cancer detection is very high in the first round in all patients, but remains high only in BRCA carriers in follow up rounds.},
+                                                         Breast cancer screening in women at elevated risk is performed with yearly MRI and mammography. This includes women with BRCA mutations and women at elevated risk for other causes (mainly family history). The purpose of this study was to assess differences between BRCA mutation carriers and non-BRCA patients in a longitudinal MRI screening program in terms of recall rate, positive predictive value, and detection.
+  
+                                                         METHOD AND MATERIALS
+                                                         An IRB approved, retrospective review of patient files from women screened with breast MRI between 2003 and 2013 was performed at our academic center. We analysed 9.504 screening MR examinations in 2843 women (age: 45 $\pm$ 12.09 years), including 761 BRCA patients, and 2082 non-BRCA patients. Recall rate (RR), positive predictive value (PPV), and cancer detection rate (CDR) were evaluated for first round examinations and follow-up examinations separately. BRCA patients were compared with non-BRCA patients. Chi-square tests were used to determine statistical significance.
+  
+                                                         RESULTS
+                                                         The RR for BRCA patients in the first round of screening was 86.07 per 1000 examinations and 52.58 per 1000 examinations in non-BRCA patients (p<0.001). The PPV for BRCA patients in the first round of screening was found to be 0.44, compared to 0.50 in non-BRCA patients (p=0.013). The CDR was 38.25 per 1000 examinations for BRCA patients and 26.53 per 1000 examinations for non-BRCA patients (p<0.001). In follow up, the RR was found to be 24.92 per 1000 examinations for BRCA patients and 22.81 per 1000 examinations for non-BRCA patients (p<0.001). The PPV was 0.46 for BRCA patients and 0.21 for non-BRCA patients (p<0.001). CDR was 11.42 per 1000 examinations for BRCA patients and 4.86 per 1000 examinations for non-BRCA patients (p<0.001).
+  
+                                                         CONCLUSION
+                                                         RR and CDR are high for all patients in the first round. RR and CDR significantly decreased in follow-up rounds (p<0.001). PPV remained at an acceptable level for both patient groups, and remains particularly high in BRCA carriers. RR, PPV, and CDR differed significantly between BRCA and non-BRCA patients in both first and follow up rounds.
+  
+                                                         CLINICAL RELEVANCE/APPLICATION
+                                                         These results underline that MRI is an excellent tool for screening high risk patients. Cancer detection is very high in the first round in all patients, but remains high only in BRCA carriers in follow up rounds.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -33981,25 +34017,25 @@ @conference{Vree16
   booktitle = ISMRM,
   year = {2016},
   abstract = {Synopsis: Women at increased risk for breast cancer require annual mammography and MRI. The purpose of this study is to evaluate cancers detected in MRI screening and assess the visibility on prior MRI-examinations. MRI-scans of breast cancers detected in our MRI screening program were re-evaluated and lesions on the diagnostic MRI and prior MRI were scored according to Breast Imaging Reporting and Data (BI-RADS) MR-lexicon. The visibility of the lesions on the prior MRI was rated as visible, minimal sign and invisible. Our results show that almost one third of the breast cancers should have been recalled based on consensus review.
-                                                       Purpose:
-                                                       Breast cancer is a main cause of cancer death, especially in women at increased risk for breast cancer. This risk is defined as a cumulative lifetime risk of more than 20%, and can be as high as 57% at the age of 70 in BRCA1-carriers.1 Screening with only mammography is insufficient in these women and supplemental annual breast MRI is currently required.2 In mammography screening it is regular practice to evaluate breast cancers detected in incident screening rounds and cancers detected in between screening round (interval cancers), and assess whether these cancers could have been detected earlier.3,4 This is rare for MRI screening. The purpose of this study is to evaluate breast cancers detected in an intermediate and high risk screening program, and assess the visibility of these cancers on prior MRI examinations. To detect possible causes for non-recall, we investigated imaging, patient, and cancer characteristics.
-                                                       Methods:
-                                                       This retrospective study was approved by our local institutional board and the requirement for informed consent was waived. We collected all breast MRI screening examinations in the period from January 2003 - January 2014. To find all malignant lesions in this population, corresponding patient files were linked to the Netherlands Cancer Registry (NCR). For each patient with breast cancer detected on an MRI-screen or interval cancer, we identified whether another MRI-screen in 6 - 24 months before cancer detection was available (prior MRI). These MRI-scans were re-evaluated together with the MRI-scan in which cancer was detected (diagnostic MRI) in consensus by two radiologists with 8 and 12 years experience. The review was performed on an in-house developed breast MRI workstation, which provided T1-weighted images for all time points for both current and prior DCE-MRI, subtraction images, and their maximum intensity projection. Images were motion corrected using an algorithm described in Gubern-MA-A?A 1/2 rida et al.5 No T2-weighted images or diffusion-weighted images were shown. On the diagnostic MRI morphological and enhancement characteristics of the known cancer were scored according to the Breast Imaging Reporting and Data (BI-RADS) MR-lexicon.6 In addition, background parenchymal enhancement (BPE) was scored as minimal (<25%), mild (25-50%), moderate (50-75%) or marked (>75%), and image quality (IQ) was scored as perfect, sufficient or bad. Thereafter, the prior MRI was analyzed. The visibility of the lesion, previously identified in the diagnostic MRI, was rated as visible (BI-RADS 4/5), minimally visible (BI-RADS 2/3), or invisible (BI-RADS 1) (Fig.1). In lesions classified as visible or minimally visible morphology and enhancement characteristics were scored. Pearsons chi-square tests were used to test if imaging, patient, and cancer characteristics affect the visibility of the tumor on the prior MRI. 
Statistics were performed in SPSS.
-                                                       Results:
-                                                       From January 2003 - January 2014, 10120 MRI-examinations were performed in 2797 women, including 807 BRCA-mutation carriers. In total, 153 cancers were found. For 69 screen-detected tumors a prior MRI was available (36 tumors in patients with a BRCA mutation). In retrospect, 20 (29%) tumors were visible on the prior MRI, 26 (38%) showed a minimal sign, and 23 (33%) were invisible. Furthermore, prior MRIs were also available for 12 interval cancers (6 tumors in patients with a BRCA mutation); 3 (25%) were visible, 4 (33%) showed a minimal sign, and 5 (33%) were not visible on the prior MRI. Tumors in BRCA patients, small tumors, tumors of high grade and hormone-negative tumors were more likely to be invisible on the prior MRI (p<0.001, p=0.039, p<0.001, p<0.001, respectively). The lack of detection of lesions scored as visible on the prior MRI was not related to BPE or IQ.
-                                                        Discussion:
-                                                       A successful MRI screening program is based on the balance between the early detection and the false positive findings that result in unnecessary biopsies and anxiety. This might explain why not all visible lesions get recalled. However, in our study we show that almost one third of cancers were already visible on the prior MRI scan in retrospect and should have been recalled according to our consensus review. This fraction was similar for screen detected and interval cancers. A possible reason for the non-recall could be that the visible lesions were already present at an earlier time point and were regarded stable over time. Non-recall was not related to BPE or IQ.
-                                                       Conclusion:
-                                                       It was seen that 28% of breast cancers should have been recalled earlier based on consensus review. Only 35% was completely invisible in retrospect. This indicates that even highly specialized breast cancer screening programs can still be improved and that regular evaluation of screening practice is essential.
-                                                       References:
-                                                       1.S. Chen et al. Meta-analysis of BRCA1 and BRCA2 penetrance. JCO (2007), 25(11):1329-33
-                                                       2.D. Saslow et al. American Cancer Society guidelines for breast screening with MRI as an adjunct to mammography. CA (2007), 57:75-89
-                                                       3.D.M. Ikeda et al. Analysis of 172 subtle findings on prior normal mammograms in women with breast cancer detected at follow-up screening. Radiology (2003), 226(2):494-503
-                                                       4.A.J. Maxwell et al. A study of breast cancers detected in the incident round of the UK NHS Breast Screening Programme: the importance of early detection and treatment of ductal carcinoma in situ. Breast (2001), 10(5):392-8
-                                                       5.A. Gubern-Merida et al. Automated localization of breast cancer in DCE-MRI. Med Imag Anal (2015),20(1):265-74
-                                                       6.BI-RADS Atlas, 5th ed 2013
-
-                                                       Acknowledgements: European Unions 7FP (Grant 601040)},
+                                                         Purpose:
+                                                         Breast cancer is a main cause of cancer death, especially in women at increased risk for breast cancer. This risk is defined as a cumulative lifetime risk of more than 20%, and can be as high as 57% at the age of 70 in BRCA1-carriers.1 Screening with only mammography is insufficient in these women and supplemental annual breast MRI is currently required.2 In mammography screening it is regular practice to evaluate breast cancers detected in incident screening rounds and cancers detected in between screening round (interval cancers), and assess whether these cancers could have been detected earlier.3,4 This is rare for MRI screening. The purpose of this study is to evaluate breast cancers detected in an intermediate and high risk screening program, and assess the visibility of these cancers on prior MRI examinations. To detect possible causes for non-recall, we investigated imaging, patient, and cancer characteristics.
+                                                         Methods:
+                                                         This retrospective study was approved by our local institutional board and the requirement for informed consent was waived. We collected all breast MRI screening examinations in the period from January 2003 - January 2014. To find all malignant lesions in this population, corresponding patient files were linked to the Netherlands Cancer Registry (NCR). For each patient with breast cancer detected on an MRI-screen or interval cancer, we identified whether another MRI-screen in 6 - 24 months before cancer detection was available (prior MRI). These MRI-scans were re-evaluated together with the MRI-scan in which cancer was detected (diagnostic MRI) in consensus by two radiologists with 8 and 12 years experience. The review was performed on an in-house developed breast MRI workstation, which provided T1-weighted images for all time points for both current and prior DCE-MRI, subtraction images, and their maximum intensity projection. Images were motion corrected using an algorithm described in Gubern-M{\'e}rida et al.5 No T2-weighted images or diffusion-weighted images were shown. On the diagnostic MRI morphological and enhancement characteristics of the known cancer were scored according to the Breast Imaging Reporting and Data (BI-RADS) MR-lexicon.6 In addition, background parenchymal enhancement (BPE) was scored as minimal (<25%), mild (25-50%), moderate (50-75%) or marked (>75%), and image quality (IQ) was scored as perfect, sufficient or bad. Thereafter, the prior MRI was analyzed. The visibility of the lesion, previously identified in the diagnostic MRI, was rated as visible (BI-RADS 4/5), minimally visible (BI-RADS 2/3), or invisible (BI-RADS 1) (Fig.1). In lesions classified as visible or minimally visible morphology and enhancement characteristics were scored. 
Pearsons chi-square tests were used to test if imaging, patient, and cancer characteristics affect the visibility of the tumor on the prior MRI. Statistics were performed in SPSS.
+                                                         Results:
+                                                         From January 2003 - January 2014, 10120 MRI-examinations were performed in 2797 women, including 807 BRCA-mutation carriers. In total, 153 cancers were found. For 69 screen-detected tumors a prior MRI was available (36 tumors in patients with a BRCA mutation). In retrospect, 20 (29%) tumors were visible on the prior MRI, 26 (38%) showed a minimal sign, and 23 (33%) were invisible. Furthermore, prior MRIs were also available for 12 interval cancers (6 tumors in patients with a BRCA mutation); 3 (25%) were visible, 4 (33%) showed a minimal sign, and 5 (33%) were not visible on the prior MRI. Tumors in BRCA patients, small tumors, tumors of high grade and hormone-negative tumors were more likely to be invisible on the prior MRI (p<0.001, p=0.039, p<0.001, p<0.001, respectively). The lack of detection of lesions scored as visible on the prior MRI was not related to BPE or IQ.
+                                                          Discussion:
+                                                         A successful MRI screening program is based on the balance between the early detection and the false positive findings that result in unnecessary biopsies and anxiety. This might explain why not all visible lesions get recalled. However, in our study we show that almost one third of cancers were already visible on the prior MRI scan in retrospect and should have been recalled according to our consensus review. This fraction was similar for screen detected and interval cancers. A possible reason for the non-recall could be that the visible lesions were already present at an earlier time point and were regarded stable over time. Non-recall was not related to BPE or IQ.
+                                                         Conclusion:
+                                                         It was seen that 28% of breast cancers should have been recalled earlier based on consensus review. Only 35% was completely invisible in retrospect. This indicates that even highly specialized breast cancer screening programs can still be improved and that regular evaluation of screening practice is essential.
+                                                         References:
+                                                         1.S. Chen et al. Meta-analysis of BRCA1 and BRCA2 penetrance. JCO (2007), 25(11):1329-33
+                                                         2.D. Saslow et al. American Cancer Society guidelines for breast screening with MRI as an adjunct to mammography. CA (2007), 57:75-89
+                                                         3.D.M. Ikeda et al. Analysis of 172 subtle findings on prior normal mammograms in women with breast cancer detected at follow-up screening. Radiology (2003), 226(2):494-503
+                                                         4.A.J. Maxwell et al. A study of breast cancers detected in the incident round of the UK NHS Breast Screening Programme: the importance of early detection and treatment of ductal carcinoma in situ. Breast (2001), 10(5):392-8
+                                                         5.A. Gubern-Merida et al. Automated localization of breast cancer in DCE-MRI. Med Imag Anal (2015),20(1):265-74
+                                                         6.BI-RADS Atlas, 5th ed 2013
+  
+                                                         Acknowledgements: European Unions 7FP (Grant 601040)},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34009,9 +34045,9 @@ @conference{Vree16a
   booktitle = EBCC,
   year = {2016},
   abstract = {Background: Intensive screening with annual mammography and MRI is offered to women at high risk for the development of breast cancer. Although most cancers are screen detected, screening does not prevent breast cancers from occurring and some are still detected between screening rounds (true interval cancers). Consequently, some women opt for prophylactic mastectomy rather than intensive screening since this reduces the incidence of breast cancer. Unfortunately, detection of cancer in a prophylactic mastectomy specimen (incident cancers) is not a rare occurrence. It is unsure whether these cancers should be considered as interval cancers. This study evaluates the prognostic factors of cancers stratified by the mode of tumor detection in these women.
-                                                       Material and methods: Review of our intermediate and high risk screening program from 2003 to 2013 identified 177 cancers. Of these, 136 were detected in screening, 15 cancers were true interval carcinomas detected due to symptoms, and 26 cancers were detected in prophylactic mastectomy specimens. Patient- and cancer characteristics (invasive versus in-situ disease, grade, pT-stage, age, menopausal state, cancer receptor status and pN-stage) between these three groups were compared using a Pearson's chi-square test for categorical variables or one-way ANOVA for continuous variables.
-                                                       Results: The fraction of invasive disease was 8/26 (30.8%), 109/136 (80.1%) and 15/15 (100%) for cancers in prophylactic mastectomy specimens, screen detected cancers and interval cancers, respectively (p<0.001). The fraction of cancers larger than two centimeters was 1/26 (3.8%), 24/136 (17.6%) and 3/15 (20.0%), respectively. A similar increase was observed for the overall pT-stage (p<0.001). Moreover, tumor grade was higher in true interval cancers than in cancers detected in prophylactic mastectomy specimens (p=0.001). Most cancers were node negative (p=0.233). There were no significant differences in patient age, menopausal state, cancer receptor status, and pN-stage between true interval cancers and prophylactic mastectomy specimens.
-                                                       Conclusions: True interval cancers are more often invasive, generally larger, and commonly of higher grade than screen detected cancers or cancers in prophylactic mastectomy specimens. The prognosis of cancers detected in prophylactic mastectomy specimens is particularly good as most of these lesions are in situ cancers only. Therefore, these incident cancers should not be regarded as interval cancers.},
+                                                         Material and methods: Review of our intermediate and high risk screening program from 2003 to 2013 identified 177 cancers. Of these, 136 were detected in screening, 15 cancers were true interval carcinomas detected due to symptoms, and 26 cancers were detected in prophylactic mastectomy specimens. Patient- and cancer characteristics (invasive versus in-situ disease, grade, pT-stage, age, menopausal state, cancer receptor status and pN-stage) between these three groups were compared using a Pearson's chi-square test for categorical variables or one-way ANOVA for continuous variables.
+                                                         Results: The fraction of invasive disease was 8/26 (30.8%), 109/136 (80.1%) and 15/15 (100%) for cancers in prophylactic mastectomy specimens, screen detected cancers and interval cancers, respectively (p<0.001). The fraction of cancers larger than two centimeters was 1/26 (3.8%), 24/136 (17.6%) and 3/15 (20.0%), respectively. A similar increase was observed for the overall pT-stage (p<0.001). Moreover, tumor grade was higher in true interval cancers than in cancers detected in prophylactic mastectomy specimens (p=0.001). Most cancers were node negative (p=0.233). There were no significant differences in patient age, menopausal state, cancer receptor status, and pN-stage between true interval cancers and prophylactic mastectomy specimens.
+                                                         Conclusions: True interval cancers are more often invasive, generally larger, and commonly of higher grade than screen detected cancers or cancers in prophylactic mastectomy specimens. The prognosis of cancers detected in prophylactic mastectomy specimens is particularly good as most of these lesions are in situ cancers only. Therefore, these incident cancers should not be regarded as interval cancers.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34021,22 +34057,22 @@ @conference{Vree16b
   booktitle = ECR,
   year = {2016},
   abstract = {Purpose: Women at increased risk for breast cancer are screened with annual MRI and mammography. However,
-                                                       despite this intensive surveillance interval cancers still occur. The purpose of this study is to evaluate prognostic
-                                                       factors of interval carcinomas and compare these to prognostic factors of screen detected cancers.
-                                                       Methods and Materials: In a review of our intermediate and high risk screening program from 2003 to 2013, 170
-                                                       cancers in 159 women were identified. Of these, 14 cancers were true interval carcinomas presenting with symptoms,
-                                                       and 132 were detected in screening. TwentyA-A?A 1/2 four further cancers were detected in prophylactic mastectomy
-                                                       specimens, and were excluded from this study. PatientA-A?A 1/2  and cancer characteristics of screen detected cancers and
-                                                       interval cancers were compared using a Pearson's chiA-A?A 1/2 squared test for categorical variables and a Student's tA-A?A 1/2 test for
-                                                       continuous variables.
-                                                       Results: Interval cancers occurred in younger patients (p=0.001), had a higher pTA-A?A 1/2 stage (p=0.046), and were more
-                                                       often ERA-A?A 1/2 negative and PRA-A?A 1/2 negative (p=0.002, and p=0.002, respectively). Tumor grade appeared worse in interval
-                                                       carcinomas and were more often invasive, but this did not reach statistical significance (p=0.062, and p=0.063,
-                                                       respectively). HER2A-A?A 1/2 status was not significantly different. Fortunately, no difference was observed in pNA-A?A 1/2 stage or
-                                                       presence of metastatic disease.
-                                                       Conclusion: Interval cancers occurring in women participating in intensive surveillance programs are of more
-                                                       aggressive nature than screen detected cancers. However, our results suggest that interval cancers are detected when
-                                                       the disease is local. This still results in a relatively good prognosis for patients with interval cancer.},
+                                                         despite this intensive surveillance interval cancers still occur. The purpose of this study is to evaluate prognostic
+                                                         factors of interval carcinomas and compare these to prognostic factors of screen detected cancers.
+                                                         Methods and Materials: In a review of our intermediate and high risk screening program from 2003 to 2013, 170
+                                                         cancers in 159 women were identified. Of these, 14 cancers were true interval carcinomas presenting with symptoms,
+                                                         and 132 were detected in screening. Twenty-four further cancers were detected in prophylactic mastectomy
+                                                         specimens, and were excluded from this study. Patient- and cancer characteristics of screen detected cancers and
+                                                         interval cancers were compared using a Pearson's chi-squared test for categorical variables and a Student's t-test for
+                                                         continuous variables.
+                                                         Results: Interval cancers occurred in younger patients (p=0.001), had a higher pT-stage (p=0.046), and were more
+                                                         often ER-negative and PR-negative (p=0.002, and p=0.002, respectively). Tumor grade appeared worse in interval
+                                                         carcinomas and were more often invasive, but this did not reach statistical significance (p=0.062, and p=0.063,
+                                                         respectively). HER2-status was not significantly different. Fortunately, no difference was observed in pN-stage or
+                                                         presence of metastatic disease.
+                                                         Conclusion: Interval cancers occurring in women participating in intensive surveillance programs are of more
+                                                         aggressive nature than screen detected cancers. However, our results suggest that interval cancers are detected when
+                                                         the disease is local. This still results in a relatively good prognosis for patients with interval cancer.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34046,21 +34082,21 @@ @conference{Vree16c
   booktitle = ECR,
   year = {2016},
   abstract = {Purpose: Breast MRI background parenchymal enhancement (BPE) has been identified as a risk factor for breast
-                                                       cancer and has been associated to certain tumor characteristics. However, it is not known whether its presence is
-                                                       related to tumor aggressiveness in high risk screening patients. The purpose of this study is to evaluate this
-                                                       association between BPE and tumor grade in high risk screen detected breast cancers.
-                                                       Methods and Materials: Review of our intermediate and high risk screening program from 2003A-A?A 1/2 2013 identified MRIscans
-                                                       of 80 cancers in 79 patients (48A-A?A 1/2 9.8 years) with biopsy proven unilateral cancer and no previous breast cancer.
-                                                       The level of BPE in the contralateral breast was scored as minimal, mild, moderate, and marked by two readers (one
-                                                       5th year resident (R1) and one experienced radiologist (R2)). Odds ratios (OR) were calculated for grade in relation to
-                                                       BPE. Observer variability was computed using kappa statistics.
-                                                       Results: A significant association was found between tumor grade and level of BPE in the contralateral breast for both
-                                                       readers (the OR for high grade tumor was 0.394 (p=0.007) for R1 and 0.310 (p=0.002) for R2). After adjusting for
-                                                       significant factors, the OR for high grade cancers was 0.924 for R1 and 2.066 for R2. Kappa value for BPE
-                                                       assessment between readers was K=0.592.
-                                                       Conclusion: Lower BPE might be associated to higher tumor grade, when only evaluating BPE. However, our results
-                                                       suggest that other factors play a major role in this association. This limits the usefulness of BPE as a parameter for
-                                                       therapy stratification.},
+                                                         cancer and has been associated to certain tumor characteristics. However, it is not known whether its presence is
+                                                         related to tumor aggressiveness in high risk screening patients. The purpose of this study is to evaluate this
+                                                         association between BPE and tumor grade in high risk screen detected breast cancers.
+                                                         Methods and Materials: Review of our intermediate and high risk screening program from 2003-2013 identified MRI scans
+                                                         of 80 cancers in 79 patients (48 $\pm$ 9.8 years) with biopsy proven unilateral cancer and no previous breast cancer.
+                                                         The level of BPE in the contralateral breast was scored as minimal, mild, moderate, and marked by two readers (one
+                                                         5th year resident (R1) and one experienced radiologist (R2)). Odds ratios (OR) were calculated for grade in relation to
+                                                         BPE. Observer variability was computed using kappa statistics.
+                                                         Results: A significant association was found between tumor grade and level of BPE in the contralateral breast for both
+                                                         readers (the OR for high grade tumor was 0.394 (p=0.007) for R1 and 0.310 (p=0.002) for R2). After adjusting for
+                                                         significant factors, the OR for high grade cancers was 0.924 for R1 and 2.066 for R2. Kappa value for BPE
+                                                         assessment between readers was K=0.592.
+                                                         Conclusion: Lower BPE might be associated to higher tumor grade, when only evaluating BPE. However, our results
+                                                         suggest that other factors play a major role in this association. This limits the usefulness of BPE as a parameter for
+                                                         therapy stratification.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34386,9 +34422,9 @@ @conference{Wand15
   booktitle = ECR,
   year = {2015},
   abstract = {Purpose: We examined to what extent mammographic density affects screening performance when using full field digital mammography (FFDM).
-                                                       Methods and Materials: We collected a consecutive series of 69,874 FFDM examinations (2003-2009) from one screening unit of the Dutch biennial screening program (50-75 years). Volumetric mammographic density was automatically assessed with Volpara version 1.5.0 (Matakina, New Zealand). Recall and breast cancer detection information was obtained from the screening registration system. Interval cancers were identified through linkage with the Netherlands Cancer Registry. Within four density categories, comparable to ACR breast density categories, we determined screening performance measures and linear trends with a Chi Square linear trend test.
-                                                       Results: 19.7% of the examinations was categorised as density category 1 ('almost entirely fatty'), 43.1% as category 2, 29.4% as category 3 and 7.7% as category 4 ('extremely dense'). In total 421 screen-detected and 150 interval tumours were identified. Cancer detection rates were 3.7?, 6.4?, 6.6? and 6.3? in categories 1 to 4 respectively (p=0.005). Interval cancer rates increased with increasing density categories: 0.7?, 1.9?, 3.0? and 4.5?, respectively (p< 0.001). As a result, the sensitivity (proportion of screen-detected tumours of screen-detected and interval tumours) was lower in higher density categories: 85.0%, 77.6%, 69.0% and 58.6% respectively (p< 0.001). The number of false positives was higher in women with dense breasts: 11.4?, 14.1?, 18.3? and 28.6? for categories 1 to 4, respectively (p< 0.001).
-                                                       Conclusion: Also when FFDM is used in breast cancer screening higher interval cancer and false-positive rates are observed in women with mammographically dense breasts.},
+                                                         Methods and Materials: We collected a consecutive series of 69,874 FFDM examinations (2003-2009) from one screening unit of the Dutch biennial screening program (50-75 years). Volumetric mammographic density was automatically assessed with Volpara version 1.5.0 (Matakina, New Zealand). Recall and breast cancer detection information was obtained from the screening registration system. Interval cancers were identified through linkage with the Netherlands Cancer Registry. Within four density categories, comparable to ACR breast density categories, we determined screening performance measures and linear trends with a Chi Square linear trend test.
+                                                         Results: 19.7% of the examinations was categorised as density category 1 ('almost entirely fatty'), 43.1% as category 2, 29.4% as category 3 and 7.7% as category 4 ('extremely dense'). In total 421 screen-detected and 150 interval tumours were identified. Cancer detection rates were 3.7{\textperthousand}, 6.4{\textperthousand}, 6.6{\textperthousand} and 6.3{\textperthousand} in categories 1 to 4 respectively (p=0.005). Interval cancer rates increased with increasing density categories: 0.7{\textperthousand}, 1.9{\textperthousand}, 3.0{\textperthousand} and 4.5{\textperthousand}, respectively (p< 0.001). As a result, the sensitivity (proportion of screen-detected tumours of screen-detected and interval tumours) was lower in higher density categories: 85.0%, 77.6%, 69.0% and 58.6% respectively (p< 0.001). The number of false positives was higher in women with dense breasts: 11.4{\textperthousand}, 14.1{\textperthousand}, 18.3{\textperthousand} and 28.6{\textperthousand} for categories 1 to 4, respectively (p< 0.001).
+                                                         Conclusion: Also when FFDM is used in breast cancer screening higher interval cancer and false-positive rates are observed in women with mammographically dense breasts.},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34422,9 +34458,9 @@ @conference{Wand16
   booktitle = {Annual conference of the International Agency for Research on Cancer},
   year = {2016},
   abstract = {Purpose: In light of breast density legislation and discussions about supplemental screening it is important to know not only one's risk of breast cancer, but particularly the risk of a tumor that is not detected through mammographic screening. We investigated the relationship between volumetric breast density and the risk of screen-detected and interval cancer within a digital mammography (DM) screening program.
-                                                       Methods: Mammographic density was automatically assessed with Volpara version 1.5.0 (Matakina, New Zealand) on the first available digital mammogram of 43,211 women (50-75 years) participating in the Dutch biennial breast cancer screening program (2003-2009). Screen-detected and interval breast cancer information was obtained from the screening registration system and through linkage with the Netherlands Cancer Registry. We estimated risks of screen-detected and interval cancers in relation to breast density using multinomial logistic regression analysis (adjusted for age). No other confounders were available in this routine screening database.
-                                                       Results: 413 screen-detected and 150 interval tumors were identified. Screen-detected breast cancer risk was significantly higher in the higher breast density categories compared to the lowest (OR: 1.65, 95% CI: 1.21-2.24, OR: 1.78, 95% CI: 1.29-2.47, OR: 1.69, 95% CI: 1.08-2.63, for density categories 2 to 4 respectively compared to 1). Interval cancer risk increased with increasing breast density (OR: 2.45, 95% CI: 1.20-4.99, OR: 5.24, 95% CI: 2.59-10.59 and OR: 6.86, 95% CI: 3.12-15.11, for density categories 2 to 4 respectively compared to 1). The relationship with interval cancers was statistically significantly stronger than with screen-detected cancers (p<0.01) for density categories 3 and 4.
-                                                       Conclusions: Although higher breast density is related to a higher risk of a screen-detected breast cancer, it is particularly strongly related to the risk of a breast cancer that is not detected through mammographic screening (interval cancer).},
+                                                         Methods: Mammographic density was automatically assessed with Volpara version 1.5.0 (Matakina, New Zealand) on the first available digital mammogram of 43,211 women (50-75 years) participating in the Dutch biennial breast cancer screening program (2003-2009). Screen-detected and interval breast cancer information was obtained from the screening registration system and through linkage with the Netherlands Cancer Registry. We estimated risks of screen-detected and interval cancers in relation to breast density using multinomial logistic regression analysis (adjusted for age). No other confounders were available in this routine screening database.
+                                                         Results: 413 screen-detected and 150 interval tumors were identified. Screen-detected breast cancer risk was significantly higher in the higher breast density categories compared to the lowest (OR: 1.65, 95% CI: 1.21-2.24, OR: 1.78, 95% CI: 1.29-2.47, OR: 1.69, 95% CI: 1.08-2.63, for density categories 2 to 4 respectively compared to 1). Interval cancer risk increased with increasing breast density (OR: 2.45, 95% CI: 1.20-4.99, OR: 5.24, 95% CI: 2.59-10.59 and OR: 6.86, 95% CI: 3.12-15.11, for density categories 2 to 4 respectively compared to 1). The relationship with interval cancers was statistically significantly stronger than with screen-detected cancers (p<0.01) for density categories 3 and 4.
+                                                         Conclusions: Although higher breast density is related to a higher risk of a screen-detected breast cancer, it is particularly strongly related to the risk of a breast cancer that is not detected through mammographic screening (interval cancer).},
   optnote = {DIAG, RADIOLOGY},
 }
 
@@ -34459,9 +34495,9 @@ @article{Wand17a
   pages = {67},
   doi = {10.1186/s13058-017-0859-9},
   abstract = {Background In the light of the breast density legislation in the USA, it is important to know a woman's breast cancer risk, but particularly her risk of a tumor that is not detected through mammographic screening (interval cancer). Therefore, we examined the associations of automatically measured volumetric breast density with screen-detected and interval cancer risk, separately.
-                                                       Methods Volumetric breast measures were assessed automatically using Volpara version 1.5.0 (Matakina, New Zealand) for the first available digital mammography (DM) examination of 52,814 women (age 50-75 years) participating in the Dutch biennial breast cancer screening program between 2003 and 2011. Breast cancer information was obtained from the screening registration system and through linkage with the Netherlands Cancer Registry. We excluded all screen-detected breast cancers diagnosed as a result of the first digital screening examination. During a median follow-up period of 4.2 (IQR 2.0-6.2) years, 523 women were diagnosed with breast cancer of which 299 were screen-detected and 224 were interval breast cancers. The associations between volumetric breast measures and breast cancer risk were determined using Cox proportional hazards analyses.
-                                                       Results Percentage dense volume was found to be positively associated with both interval and screen-detected breast cancers (hazard ratio (HR) 8.37 (95% CI 4.34-16.17) and HR 1.39 (95% CI 0.82-2.36), respectively, for Volpara density grade category (VDG) 4 compared to VDG1 (p for heterogeneity < 0.001)). Dense volume (DV) was also found to be positively associated with both interval and screen-detected breast cancers (HR 4.92 (95% CI 2.98-8.12) and HR 2.30 (95% CI 1.39-3.80), respectively, for VDG-like category (C)4 compared to C1 (p for heterogeneity = 0.041)). The association between percentage dense volume categories and interval breast cancer risk (HR 8.37) was not significantly stronger than the association between absolute dense volume categories and interval breast cancer risk (HR 4.92).
-                                                       Conclusions Our results suggest that both absolute dense volume and percentage dense volume are strong markers of breast cancer risk, but that they are even stronger markers for predicting the occurrence of tumors that are not detected during mammography breast cancer screening.},
+                                                         Methods Volumetric breast measures were assessed automatically using Volpara version 1.5.0 (Matakina, New Zealand) for the first available digital mammography (DM) examination of 52,814 women (age 50-75 years) participating in the Dutch biennial breast cancer screening program between 2003 and 2011. Breast cancer information was obtained from the screening registration system and through linkage with the Netherlands Cancer Registry. We excluded all screen-detected breast cancers diagnosed as a result of the first digital screening examination. During a median follow-up period of 4.2 (IQR 2.0-6.2) years, 523 women were diagnosed with breast cancer of which 299 were screen-detected and 224 were interval breast cancers. The associations between volumetric breast measures and breast cancer risk were determined using Cox proportional hazards analyses.
+                                                         Results Percentage dense volume was found to be positively associated with both interval and screen-detected breast cancers (hazard ratio (HR) 8.37 (95% CI 4.34-16.17) and HR 1.39 (95% CI 0.82-2.36), respectively, for Volpara density grade category (VDG) 4 compared to VDG1 (p for heterogeneity < 0.001)). Dense volume (DV) was also found to be positively associated with both interval and screen-detected breast cancers (HR 4.92 (95% CI 2.98-8.12) and HR 2.30 (95% CI 1.39-3.80), respectively, for VDG-like category (C)4 compared to C1 (p for heterogeneity = 0.041)). The association between percentage dense volume categories and interval breast cancer risk (HR 8.37) was not significantly stronger than the association between absolute dense volume categories and interval breast cancer risk (HR 4.92).
+                                                         Conclusions Our results suggest that both absolute dense volume and percentage dense volume are strong markers of breast cancer risk, but that they are even stronger markers for predicting the occurrence of tumors that are not detected during mammography breast cancer screening.},
   file = {:pdf/Wand17a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {28583146},
@@ -34580,7 +34616,7 @@ @conference{Wiel10
   booktitle = ISMRM,
   year = {2010},
   abstract = {Introduction: Tracheomalacia (TM) refers to a weakness of the trachea, frequently due to reduction and/or atrophy of the longitudinal elastic fibers of the pars membranacea, or impaired cartilage integrity, such that the airway is softer and more susceptible to collapse. Various degrees of tracheal collapse, and therefore airway obstruction, can result from this narrowing. Diagnosis of TM includes history and physical examination, e.g. expiratory manoeuvre and cough. Pulmonary function tests include the determination of flow limitations during expiration. However, endoscopy is the essential and invaluable tool and remains the gold standard method for evaluating the airways. From the imaging perspective, conventional radiographs have had a lower sensitivity (62%), and are used in conjunction with endoscopy. A CT-scan is the initial radiologic test in cases of suspect TM. MRI is another imaging possibility for evaluating central airway abnormalities, however, not often used because of severe drawbacks in an area with large magnetic susceptibility gradients, poor signal homogeneity and prone to low spatial resolution and motion artifacts. The majority of papers diagnosis of TM considers imaging during end-inspiration and end-expiration. Nonetheless, more recently, some authors have demonstrated the importance of dynamic CINE acquisitions, indicating that dynamic-MRI studies during coughing may facilitate the evaluation of the collapsibility of the trachea in patients with TM. 
Purpose:The purpose of this work was to provide: first, a suitable acquisition scenario including static and dynamic 3D MRI sequences with sufficient temporal and spatial resolution to provide good morphological information and visualization of dynamic events in the central airways and, secondly, to provide the means for an automatic analysis program suitable to segment the airway lumen and a dynamic evaluation of cross-sectional areas of the central airways down to the 2nd generation branching. Materials and Methods: 10 healthy adult volunteers between 18 and 50 years of age were recruited as pilot group to optimize image acquisition for the static and dynamic portions of the MRI examination at 1.5T. Volunteers were trained to perform spirometry controlled breathing manoeuvres using a MRI compatible spirometer. Each subject was instructed additionally to perform forced expiration and cough maneuvers. A-A?A 1/2 Static?? 13-second breath-hold scans covering the entire thoracic region were acquired at end-inspiration and endexpiration using a 3D rf-spoiled gradient echo sequence with TR/TE=1.2/0.5 ms, flip angle 2A-A?A 1/2 , sagittal volume acquisition with isotropic (2.8) 3 mm3 voxels. A-A?A 1/2 Dynamic?? scans were performed with the same scan parameters but covering only the central thorax (1/3 volume) with a temporal resolution of 500 ms per volume using the TRICKS (time resolved imaging of contrast kinetics) platform and accelerated imaging options. In-house developed software for segmentation and analysis was used. To initiate the timedomain analysis 3 seeds were placed corresponding to the beginning of the trachea and ends of the left and right primary bronchi to produce a centerline. The lumen is then segmented and a surface created to produce a unique reference frame to ease the timeanalysis (Figure 1). A cross-sectional analysis can then be performed to determine stenosis and distensibility parameters. 
Likewise, longitudinal and geometrical analyses (e.g., bifurcation angles and planarity) are generated. Results and Discussion: The software tracks the level of the branching automatically and provides a uniquely defined origin per data set thus enabling time comparisons in the same individual and across healthy and patients with TM. The analysis is completely automated (except for three seed points for lumen), providing as output any lumen based parameters that are desired and/or are clinical relevant. With optimized parameter settings the method successfully tracked the central airway paths in all volunteers. Conclusions: The results show that
-                                                       robust and accurate segmentation of the airways is feasible with the acquired MRI datasets. This work is highly relevant for clinical research and practice: automated lumen segmentation in patients with TM (or other related disease of the airways) is the first step for automatic grading of airway malignancy.},
+                                                         robust and accurate segmentation of the airways is feasible with the acquired MRI datasets. This work is highly relevant for clinical research and practice: automated lumen segmentation in patients with TM (or other related disease of the airways) is the first step for automatic grading of airway malignancy.},
   file = {Wiel10.pdf:pdf\\Wiel10.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
 }
@@ -34626,14 +34662,17 @@ @article{Wild23a
 }
 
 @article{Wild23b,
-  author = {de Wilde, Bram and Saha, Anindo and ten Broek, Richard PG and Huisman, Henkjan},
+  author = {de Wilde, Bram and Saha, Anindo and ten Broek, Richard P. G. and Huisman, Henkjan},
   title = {Medical diffusion on a budget: textual inversion for medical image generation},
-  journal = {arXiv:2303.13430},
-  optnote = {DIAG, RADIOLOGY},
+  doi = {10.48550/ARXIV.2303.13430},
   year = {2023},
-  ss_id = {41579777836a07a1bd8b4d8593fcda7983b68e67},
+  abstract = {Diffusion-based models for text-to-image generation have gained immense popularity due to recent advancements in efficiency, accessibility, and quality. Although it is becoming increasingly feasible to perform inference with these systems using consumer-grade GPUs, training them from scratch still requires access to large datasets and significant computational resources. In the case of medical image generation, the availability of large, publicly accessible datasets that include text reports is limited due to legal and ethical concerns. While training a diffusion model on a private dataset may address this issue, it is not always feasible for institutions lacking the necessary computational resources. This work demonstrates that pre-trained Stable Diffusion models, originally trained on natural images, can be adapted to various medical imaging modalities by training text embeddings with textual inversion. In this study, we conducted experiments using medical datasets comprising only 100 samples from three medical modalities. Embeddings were trained in a matter of hours, while still retaining diagnostic relevance in image generation. Experiments were designed to achieve several objectives. Firstly, we fine-tuned the training and inference processes of textual inversion, revealing that larger embeddings and more examples are required. Secondly, we validated our approach by demonstrating a 2\% increase in the diagnostic accuracy (AUC) for detecting prostate cancer on MRI, which is a challenging multi-modal imaging modality, from 0.78 to 0.80. Thirdly, we performed simulations by interpolating between healthy and diseased states, combining multiple pathologies, and inpainting to show embedding flexibility and control of disease appearance. Finally, the embeddings trained in this study are small (less than 1 MB), which facilitates easy sharing of medical data with reduced privacy concerns.},
+  url = {https://arxiv.org/abs/2303.13430},
+  file = {Wild23b.pdf:pdf\\Wild23b.pdf:PDF},
+  optnote = {DIAG, RADIOLOGY},
+  journal = {arXiv:2303.13430},
+  automatic = {yes},
   all_ss_ids = {['41579777836a07a1bd8b4d8593fcda7983b68e67']},
-  gscites = {5},
 }
 
 @article{Wild23c,
@@ -34718,22 +34757,22 @@ @article{Wink21a
   url = {http://dx.doi.org/10.1007/s00330-021-07992-w},
   volume = {31},
   abstract = {Abstract
-                                         Objectives
-                                         Digital breast tomosynthesis (DBT) increases sensitivity of mammography and is increasingly implemented in breast cancer screening. However, the large volume of images increases the risk of reading errors and reading time. This study aims to investigate whether the accuracy of breast radiologists reading wide-angle DBT increases with the aid of an artificial intelligence (AI) support system. Also, the impact on reading time was assessed and the stand-alone performance of the AI system in the detection of malignancies was compared to the average radiologist.
-
-                                         Methods
-                                         A multi-reader multi-case study was performed with 240 bilateral DBT exams (71 breasts with cancer lesions, 70 breasts with benign findings, 339 normal breasts). Exams were interpreted by 18 radiologists, with and without AI support, providing cancer suspicion scores per breast. Using AI support, radiologists were shown examination-based and region-based cancer likelihood scores. Area under the receiver operating characteristic curve (AUC) and reading time per exam were compared between reading conditions using mixed-models analysis of variance.
-
-                                         Results
-                                         On average, the AUC was higher using AI support (0.863 vs 0.833; p = 0.0025). Using AI support, reading time per DBT exam was reduced (p &lt; 0.001) from 41 (95% CI = 39-42 s) to 36 s (95% CI = 35- 37 s). The AUC of the stand-alone AI system was non-inferior to the AUC of the average radiologist (+0.007, p = 0.8115).
-
-                                         Conclusions
-                                         Radiologists improved their cancer detection and reduced reading time when evaluating DBT examinations using an AI reading support system.
-
-                                         Key Points
-                                         * Radiologists improved their cancer detection accuracy in digital breast tomosynthesis (DBT) when using an AI system for support, while simultaneously reducing reading time.
-                                         * The stand-alone breast cancer detection performance of an AI system is non-inferior to the average performance of radiologists for reading digital breast tomosynthesis exams.
-                                         * The use of an AI support system could make advanced and more reliable imaging techniques more accessible and could allow for more cost-effective breast screening programs with DBT.},
+                                           Objectives
+                                           Digital breast tomosynthesis (DBT) increases sensitivity of mammography and is increasingly implemented in breast cancer screening. However, the large volume of images increases the risk of reading errors and reading time. This study aims to investigate whether the accuracy of breast radiologists reading wide-angle DBT increases with the aid of an artificial intelligence (AI) support system. Also, the impact on reading time was assessed and the stand-alone performance of the AI system in the detection of malignancies was compared to the average radiologist.
+  
+                                           Methods
+                                           A multi-reader multi-case study was performed with 240 bilateral DBT exams (71 breasts with cancer lesions, 70 breasts with benign findings, 339 normal breasts). Exams were interpreted by 18 radiologists, with and without AI support, providing cancer suspicion scores per breast. Using AI support, radiologists were shown examination-based and region-based cancer likelihood scores. Area under the receiver operating characteristic curve (AUC) and reading time per exam were compared between reading conditions using mixed-models analysis of variance.
+  
+                                           Results
+                                           On average, the AUC was higher using AI support (0.863 vs 0.833; p = 0.0025). Using AI support, reading time per DBT exam was reduced (p < 0.001) from 41 (95% CI = 39-42 s) to 36 s (95% CI = 35-37 s). The AUC of the stand-alone AI system was non-inferior to the AUC of the average radiologist (+0.007, p = 0.8115).
+  
+                                           Conclusions
+                                           Radiologists improved their cancer detection and reduced reading time when evaluating DBT examinations using an AI reading support system.
+  
+                                           Key Points
+                                           * Radiologists improved their cancer detection accuracy in digital breast tomosynthesis (DBT) when using an AI system for support, while simultaneously reducing reading time.
+                                           * The stand-alone breast cancer detection performance of an AI system is non-inferior to the average performance of radiologists for reading digital breast tomosynthesis exams.
+                                           * The use of an AI support system could make advanced and more reliable imaging techniques more accessible and could allow for more cost-effective breast screening programs with DBT.},
   all_ss_ids = {[ef605cc8ccceaa047adac1f1d69668fe3993ace3]},
   automatic = {yes},
   citation-count = {36},
@@ -34938,11 +34977,11 @@ @phdthesis{Xie23a
   title = {Deep Learning for Treatment Planning in Chronic Obstructive Pulmonary Diseases},
   url = {https://repository.ubn.ru.nl/bitstream/handle/2066/294845/294845.pdf},
   abstract = {In Chapter 1, we introduced chronic obstructive pulmonary disease (COPD) and gave background information on COPD diagnosis and treatment planning. We described the role of quantitative CT analysis in COPD treatment planning. Furthermore, we provided a short history of image analysis, from applying low-level image processing to deep learning-based CT analysis, explaining the reason behind deep learning prosperity on the road to being data-driven.
-                          In Chapter 2, we presented a novel method using relational two-stage convolu-tion neural networks for segmenting pulmonary lobes in CT images. The proposed method uses a non-local neural network to capture visual and geometric correspondence between high-level convolution features, which represents the relationships between objects and object parts. Our results demonstrate that learning feature correspondence improves the lobe segmentation performance substantially than the baseline on the COPD and the COVID-19 data set.
-                          In Chapter 3, we presented a method for labeling segmental airways given a segmented airway tree. First, we train a convolution neural network to extract features for representing airway branches. Then, these features are iteratively enriched in agraph neural network by collecting information from neighbors, where the graph is based on the airway tree connectivity. Furthermore, we leverage positional information in our graph neural network, where the position of each branch is encoded by its topological distance to a set of anchor branches. As a result, the learned features are structure- and position-aware, contributing to substantially improved branch classification results compared with methods that use only convolution features or standard graph neural networks.
-                          In Chapter 4, we proposed a novel weakly-supervised segmentation framework trained end-to-end, using only image-level supervision. We show that this approach can produce high-resolution segmentation maps without voxel-level annotations.The proposed method substantially outperforms other weakly-supervised methods,although a gap with the fully-supervised performance remains. Our method trained a segmentation network to predict per-image lesion percentage. We made this training possible by proposing an interval regression loss, given only the upper and lower bound of the target percentage, not the exact percentage as supervision. Furthermore, we stabilized the regression training using equivariant regularization. In the refinement process, we proposed an attention neural network module that updated activation maps in one location using nearby activations, acting similar to random walkers, and seeded regional growth in standard post-processing pipelines, yet ours is trained end-to-end.
-                          In Chapter 5, we expanded on the method outlined in Chapter 4 to predict emphysema subtypes. Our proposed algorithm generates high-resolution emphysema segmentation maps to aid in interpreting the prediction process, offering an improved model interpretability compared to the baseline. To predict both subtypes together, we employ the overlapping loss to ensure that each voxel is only assigned to onesubtype (centrilobular or paraseptal). We also use low-attenuation areas in the lung(LAA-950) as visual cues in regression training, providing the network with localized information. Our approach generates categorical visual scores, estimated emphysema percentages, and high-resolution segmentation maps for both centrilobularand paraseptal subtypes, making it more versatile than the baseline approach.
-                          Finally, in Chapter 6, we reflected on this thesis's main findings and contributions.We also analyzed the advances and impact in the field and the existing limitations of the proposed methods. Additionally, we provided a future outlook for research opportunities in the field of deep learning for medical image analysis.},
+                            In Chapter 2, we presented a novel method using relational two-stage convolution neural networks for segmenting pulmonary lobes in CT images. The proposed method uses a non-local neural network to capture visual and geometric correspondence between high-level convolution features, which represents the relationships between objects and object parts. Our results demonstrate that learning feature correspondence improves the lobe segmentation performance substantially than the baseline on the COPD and the COVID-19 data set.
+                            In Chapter 3, we presented a method for labeling segmental airways given a segmented airway tree. First, we train a convolution neural network to extract features for representing airway branches. Then, these features are iteratively enriched in a graph neural network by collecting information from neighbors, where the graph is based on the airway tree connectivity. Furthermore, we leverage positional information in our graph neural network, where the position of each branch is encoded by its topological distance to a set of anchor branches. As a result, the learned features are structure- and position-aware, contributing to substantially improved branch classification results compared with methods that use only convolution features or standard graph neural networks.
+                            In Chapter 4, we proposed a novel weakly-supervised segmentation framework trained end-to-end, using only image-level supervision. We show that this approach can produce high-resolution segmentation maps without voxel-level annotations. The proposed method substantially outperforms other weakly-supervised methods, although a gap with the fully-supervised performance remains. Our method trained a segmentation network to predict per-image lesion percentage. We made this training possible by proposing an interval regression loss, given only the upper and lower bound of the target percentage, not the exact percentage as supervision. Furthermore, we stabilized the regression training using equivariant regularization. In the refinement process, we proposed an attention neural network module that updated activation maps in one location using nearby activations, acting similar to random walkers, and seeded regional growth in standard post-processing pipelines, yet ours is trained end-to-end.
+                            In Chapter 5, we expanded on the method outlined in Chapter 4 to predict emphysema subtypes. Our proposed algorithm generates high-resolution emphysema segmentation maps to aid in interpreting the prediction process, offering an improved model interpretability compared to the baseline. To predict both subtypes together, we employ the overlapping loss to ensure that each voxel is only assigned to one subtype (centrilobular or paraseptal). We also use low-attenuation areas in the lung (LAA-950) as visual cues in regression training, providing the network with localized information. Our approach generates categorical visual scores, estimated emphysema percentages, and high-resolution segmentation maps for both centrilobular and paraseptal subtypes, making it more versatile than the baseline approach.
+                            Finally, in Chapter 6, we reflected on this thesis's main findings and contributions. We also analyzed the advances and impact in the field and the existing limitations of the proposed methods. Additionally, we provided a future outlook for research opportunities in the field of deep learning for medical image analysis.},
   copromotor = {C. Jacobs},
   file = {Xie23a.pdf:pdf\\Xie23a.pdf:PDF},
   optnote = {DIAG},
@@ -35167,18 +35206,18 @@ @conference{Zeel19
   booktitle = ARVO,
   title = {{EyeNED} workstation: Development of a multi-modal vendor-independent application for annotation, spatial alignment and analysis of retinal images},
   abstract = {Purpose:
-                                                       Researchers and specialists in the field of ophthalmology currently rely on suboptimal vendor-specific software solutions for viewing and annotating retinal images. Our goal was to develop a fully-featured vendor-independent application that allows researchers and specialists to visualize multi-modal retinal images, perform spatial alignment and annotations, and review outputs of artificial intelligence (AI) algorithms.
-
-                                                       Methods:
-                                                       The application consists of a web-based front-end that allows users to analyze baseline and follow-up images in a multi-modal viewer. It communicates with a back-end interface for grader authentication, loading and storing of images and annotation data. Several types of annotation techniques are available, ranging from image-level classification to point-based and region-based lesion-level annotations.
-
-                                                       The user can select color fundus (CF) images, optical coherence tomography (OCT) volumes, infrared (IR) and autofluorescence (AF) images to be shown simultaneously in the viewer. Spatial alignment of the different modalities can be performed using an integrated affine registration method by clicking on corresponding landmarks, after which a synchronized cursor will appear. After several graders have annotated lesions, the application can be used to compare these and create a consensus grading.
-
-                                                       Results :
-                                                       The application was used by graders and researchers in the EyeNED research group. Region based annotations of geographic atrophy were made for 313 studies containing 488 CF images and 68 OCT images; and of drusen in 100 OCT b-scans. Semi-automatic annotation of the area of central retinal atrophy in Stargardt disease was performed for 67 AF images. Point-based annotation was carried out on lesions in 50 CF images of diabetic retinopathy patients. The multimodal viewing and localisation of lesions was perceived as particularly helpful in the grading of lesions and consensus discussions.
-
-                                                       Conclusions :
-                                                       A software solution has been developed to assist researchers and specialists to view and annotate retinal images. The application was successfully used for annotating lesions in various imaging modalities, facilitating the grading of images in large studies and the collection of annotations for AI solutions.},
+                                                         Researchers and specialists in the field of ophthalmology currently rely on suboptimal vendor-specific software solutions for viewing and annotating retinal images. Our goal was to develop a fully-featured vendor-independent application that allows researchers and specialists to visualize multi-modal retinal images, perform spatial alignment and annotations, and review outputs of artificial intelligence (AI) algorithms.
+  
+                                                         Methods:
+                                                         The application consists of a web-based front-end that allows users to analyze baseline and follow-up images in a multi-modal viewer. It communicates with a back-end interface for grader authentication, loading and storing of images and annotation data. Several types of annotation techniques are available, ranging from image-level classification to point-based and region-based lesion-level annotations.
+  
+                                                         The user can select color fundus (CF) images, optical coherence tomography (OCT) volumes, infrared (IR) and autofluorescence (AF) images to be shown simultaneously in the viewer. Spatial alignment of the different modalities can be performed using an integrated affine registration method by clicking on corresponding landmarks, after which a synchronized cursor will appear. After several graders have annotated lesions, the application can be used to compare these and create a consensus grading.
+  
+                                                         Results:
+                                                         The application was used by graders and researchers in the EyeNED research group. Region based annotations of geographic atrophy were made for 313 studies containing 488 CF images and 68 OCT images; and of drusen in 100 OCT b-scans. Semi-automatic annotation of the area of central retinal atrophy in Stargardt disease was performed for 67 AF images. Point-based annotation was carried out on lesions in 50 CF images of diabetic retinopathy patients. The multimodal viewing and localisation of lesions was perceived as particularly helpful in the grading of lesions and consensus discussions.
+  
+                                                         Conclusions:
+                                                         A software solution has been developed to assist researchers and specialists to view and annotate retinal images. The application was successfully used for annotating lesions in various imaging modalities, facilitating the grading of images in large studies and the collection of annotations for AI solutions.},
   optnote = {DIAG, RADIOLOGY},
   year = {2019},
   gsid = {5177328248453722349},
@@ -35215,16 +35254,16 @@ @article{Zels15
   pages = {1489--1496},
   doi = {10.1016/j.acra.2015.08.006},
   abstract = {RATIONALE AND OBJECTIVES:
-                                                       To investigate the value of multiplanar reconstructions (MPRs) of automated three-dimensional (3D) breast ultrasound (ABUS) compared to transverse evaluation only, in differentiation of benign and malignant breast lesions.
-
-                                                       MATERIALS AND METHODS:
-                                                       Five breast radiologists evaluated ABUS scans of 96 female patients with biopsy-proven abnormalities (36 malignant and 60 benign). They classified the most suspicious lesion based on the breast imaging reporting and data system (BI-RADS) lexicon using the transverse scans only. A likelihood-of-malignancy (LOM) score (0-100) and a BI-RADS final assessment were assigned. Thereafter, the MPR was provided and readers scored the cases again. In addition, they rated the presence of spiculation and retraction in the coronal plane on a five-point scale called Spiculation and Retraction Severity Index (SRSI). Reader performance was analyzed with receiver-operating characteristics analysis.
-
-                                                       RESULTS:
-                                                       The area under the curve increased from 0.82 to 0.87 (P = .01) after readers were shown the reconstructed planes. The SRSI scores are highly correlated (Spearman's r) with the final LOM scores (range, r = 0.808-0.872) and DLOM scores (range, r = 0.525-0.836). Readers downgraded 3%-18% of the biopsied benign lesions to BI-RADS 2 after MPR evaluation. Inter-reader agreement for SRSI was substantial (intraclass correlation coefficient, 0.617). Inter-reader agreement of the BI-RADS final assessment improved from 0.367 to 0.536 after MPRs were read.
-
-                                                       CONCLUSIONS:
-                                                       Full 3D evaluation of ABUS using MPR improves differentiation of breast lesions in comparison to evaluating only transverse planes. Results suggest that the added value of MPR might be related to visualization of spiculation and retraction patterns in the coronal reconstructions.},
+                                                         To investigate the value of multiplanar reconstructions (MPRs) of automated three-dimensional (3D) breast ultrasound (ABUS) compared to transverse evaluation only, in differentiation of benign and malignant breast lesions.
+  
+                                                         MATERIALS AND METHODS:
+                                                         Five breast radiologists evaluated ABUS scans of 96 female patients with biopsy-proven abnormalities (36 malignant and 60 benign). They classified the most suspicious lesion based on the breast imaging reporting and data system (BI-RADS) lexicon using the transverse scans only. A likelihood-of-malignancy (LOM) score (0-100) and a BI-RADS final assessment were assigned. Thereafter, the MPR was provided and readers scored the cases again. In addition, they rated the presence of spiculation and retraction in the coronal plane on a five-point scale called Spiculation and Retraction Severity Index (SRSI). Reader performance was analyzed with receiver-operating characteristics analysis.
+  
+                                                         RESULTS:
+                                                         The area under the curve increased from 0.82 to 0.87 (P = .01) after readers were shown the reconstructed planes. The SRSI scores are highly correlated (Spearman's r) with the final LOM scores (range, r = 0.808-0.872) and DLOM scores (range, r = 0.525-0.836). Readers downgraded 3%-18% of the biopsied benign lesions to BI-RADS 2 after MPR evaluation. Inter-reader agreement for SRSI was substantial (intraclass correlation coefficient, 0.617). Inter-reader agreement of the BI-RADS final assessment improved from 0.367 to 0.536 after MPRs were read.
+  
+                                                         CONCLUSIONS:
+                                                         Full 3D evaluation of ABUS using MPR improves differentiation of breast lesions in comparison to evaluating only transverse planes. Results suggest that the added value of MPR might be related to visualization of spiculation and retraction patterns in the coronal reconstructions.},
   file = {Zels15.pdf:pdf\\Zels15.pdf:PDF},
   optnote = {DIAG},
   publisher = {Elsevier},
@@ -35375,7 +35414,7 @@ @article{Zels19a
   pages = {312-320},
   doi = {10.1177/0284185119858051},
   abstract = {Background: Computer-aided detection software for automated breast ultrasound has been shown to have potential in improving the accuracy of radiologists. Alternative ways of implementing computer-aided detection, such as independent validation or preselecting suspicious cases, might also improve radiologists' accuracy.
-                                                       Purpose: To investigate the effect of using computer-aided detection software to improve the performance of radiologists by validating findings reported by radiologists during screening with automated breast ultrasound. Material and Methods: Unilateral automated breast ultrasound exams were performed in 120 women with dense breasts that included 60 randomly selected normal exams, 30 exams with benign lesions, and 30 malignant cases (20 mammography-negative). Eight radiologists were instructed to detect breast cancer and rate lesions using BI-RADS and level-of-suspiciousness scores. Computer-aided detection software was used to check the validity of radiologists' findings. Findings found negative by computer-aided detection were not included in the readers' performance analysis; however, the nature of these findings were further analyzed. The area under the curve and the partial area under the curve for an interval in the range of 80%-100% specificity before and after validation of computer-aided detection were compared. Sensitivity was computed for all readers at a simulation of 90% specificity. Results: Partial AUC improved significantly from 0.126 (95% confidence interval [CI] = 0.098-0.153) to 0.142 (95% CI = 0.115-0.169) (P = 0.037) after computer-aided detection rejected mostly benign lesions and normal tissue scored BI-RADS 3 or 4. The full areas under the curve (0.823 vs. 0.833, respectively) were not significantly different (P = 0.743). Four cancers detected by readers were completely missed by computer-aided detection and four other cancers were detected by both readers and computer-aided detection but falsely rejected due to technical limitations of our implementation of computer-aided detection validation. 
In this study, validation of computer-aided detection discarded 42.6% of findings that were scored BI-RADS >=3 by the radiologists, of which 85.5% were non-malignant findings.Conclusion: Validation of radiologists' findings using computer-aided detection software for automated breast ultrasound has the potential to improve the performance of radiologists. Validation of computer-aided detection might be an efficient tool for double-reading strategies by limiting the amount of discordant cases needed to be double-read.},
+                                                         Purpose: To investigate the effect of using computer-aided detection software to improve the performance of radiologists by validating findings reported by radiologists during screening with automated breast ultrasound. Material and Methods: Unilateral automated breast ultrasound exams were performed in 120 women with dense breasts that included 60 randomly selected normal exams, 30 exams with benign lesions, and 30 malignant cases (20 mammography-negative). Eight radiologists were instructed to detect breast cancer and rate lesions using BI-RADS and level-of-suspiciousness scores. Computer-aided detection software was used to check the validity of radiologists' findings. Findings found negative by computer-aided detection were not included in the readers' performance analysis; however, the nature of these findings were further analyzed. The area under the curve and the partial area under the curve for an interval in the range of 80%-100% specificity before and after validation of computer-aided detection were compared. Sensitivity was computed for all readers at a simulation of 90% specificity. Results: Partial AUC improved significantly from 0.126 (95% confidence interval [CI] = 0.098-0.153) to 0.142 (95% CI = 0.115-0.169) (P = 0.037) after computer-aided detection rejected mostly benign lesions and normal tissue scored BI-RADS 3 or 4. The full areas under the curve (0.823 vs. 0.833, respectively) were not significantly different (P = 0.743). Four cancers detected by readers were completely missed by computer-aided detection and four other cancers were detected by both readers and computer-aided detection but falsely rejected due to technical limitations of our implementation of computer-aided detection validation. 
In this study, validation of computer-aided detection discarded 42.6% of findings that were scored BI-RADS >=3 by the radiologists, of which 85.5% were non-malignant findings. Conclusion: Validation of radiologists' findings using computer-aided detection software for automated breast ultrasound has the potential to improve the performance of radiologists. Validation of computer-aided detection might be an efficient tool for double-reading strategies by limiting the amount of discordant cases needed to be double-read.},
   file = {Zels19a.pdf:pdf\\Zels19a.pdf:PDF},
   optnote = {DIAG, RADIOLOGY},
   pmid = {31324132},
@@ -35402,17 +35441,22 @@ @article{Zhai22
   gscites = {1},
 }
 
-@article{Zhou20,
-  author = {S. Kevin Zhou and Hayit Greenspan and Christos Davatzikos and James S. Duncan and Bram van Ginneken and Anant Madabhushi and Jerry L. Prince and Daniel Rueckert and Ronald M. Summers},
-  title = {A review of deep learning in medical imaging: Image traits, technology trends, case studies with progress highlights, and future promises},
-  journal = {arXiv:2008.09104},
-  year = {2020},
-  abstract = {Since its renaissance, deep learning has been widely used in various medical imaging tasks and has achieved remarkable success in many medical imaging applications, thereby propelling us into the so-called artificial intelligence (AI) era. It is known that the success of AI is mostly attributed to the availability of big data with annotations for a single task and the advances in high performance computing. However, medical imaging presents unique challenges that confront deep learning approaches. In this survey paper, we first highlight both clinical needs and technical challenges in medical imaging and describe how emerging trends in deep learning are addressing these issues. We cover the topics of network architecture, sparse and noisy labels, federating learning, interpretability, uncertainty quantification, etc. Then, we present several case studies that are commonly found in clinical practice, including digital pathology and chest, brain, cardiovascular, and abdominal imaging. Rather than presenting an exhaustive literature survey, we instead describe some prominent research highlights related to these case study applications. We conclude with a discussion and presentation of promising future directions.},
-  file = {:http\://arxiv.org/pdf/2008.09104v1:PDF},
-  optnote = {DIAG},
-  ss_id = {4043785dacd1c04ed93ec1c08ecf779f4e1717fc},
+@article{Zhou21,
+  author = {Zhou, S. Kevin and Greenspan, Hayit and Davatzikos, Christos and Duncan, James S. and van Ginneken, Bram and Madabhushi, Anant and Prince, Jerry L. and Rueckert, Daniel and Summers, Ronald M.},
+  title = {A Review of Deep Learning in Medical Imaging: Imaging Traits, Technology Trends, Case Studies With Progress Highlights, and Future Promises},
+  doi = {10.1109/jproc.2021.3054390},
+  year = {2021},
+  abstract = {Since its renaissance, deep learning has been widely used in various medical imaging tasks and has achieved remarkable success in many medical imaging applications, thereby propelling us into the so-called artificial intelligence (AI) era. It is known that the success of AI is mostly attributed to the availability of big data with annotations for a single task and the advances in high performance computing. However, medical imaging presents unique challenges that confront deep learning approaches. In this survey paper, we first present traits of medical imaging, highlight both clinical needs and technical challenges in medical imaging, and describe how emerging trends in deep learning are addressing these issues. We cover the topics of network architecture, sparse and noisy labels, federating learning, interpretability, uncertainty quantification, etc. Then, we present several case studies that are commonly found in clinical practice, including digital pathology and chest, brain, cardiovascular, and abdominal imaging. Rather than presenting an exhaustive literature survey, we instead describe some prominent research highlights related to these case study applications. We conclude with a discussion and presentation of promising future directions.},
+  url = {http://dx.doi.org/10.1109/jproc.2021.3054390},
+  file = {Zhou21.pdf:pdf\\Zhou21.pdf:PDF},
+  optnote = {DIAG, RADIOLOGY},
+  journal = {Proceedings of the IEEE},
+  automatic = {yes},
   all_ss_ids = {['4043785dacd1c04ed93ec1c08ecf779f4e1717fc']},
-  gscites = {406},
+  citation-count = {418},
+  pages = {820--838},
+  volume = {109},
+  pmid = {37786449},
 }
 
 @article{Zoet23,