Skip to content

Commit

Permalink
BCDA-8146: Make file server content-encoding aware (#948)
Browse files Browse the repository at this point in the history
## 🎫 Ticket

https://jira.cms.gov/browse/BCDA-8146

## 🛠 Changes

Added magic number check for files, to determine if it is gzip encoded
(or not).

## ℹ️ Context for reviewers

As part of BCDA's gzip epic, we are aiming to simultaneously decrease
costs on EFS, while also increasing the overall throughput of files.
When we switch over from primarily storing content uncompressed to
primarily storing data gzip-encoded, there will be at least some period
of time where both encodings are present. The BCDA application will now
check the magic number
(https://en.wikipedia.org/wiki/Magic_number_(programming)) to determine
if content is encoded or not, and will handle the data appropriately
(decompressing the file if an Accept-Encoding header of gzip is not
present).

There is a binary file,
shared_files/gzip_feature_test/corrupt_gz_file.ndjson, that is not able
to be opened with a normal text editor. It's advised to open files of
this type in a hex editor for manual inspection. In this case, the file
has the magic number of a gzip file, but the subsequent data is not
compliant.

## ✅ Acceptance Validation

Added unit tests to handle the various cases. 

## 🔒 Security Implications

- [ ] This PR adds a new software dependency or dependencies.
- [ ] This PR modifies or invalidates one or more of our security
controls.
- [ ] This PR stores or transmits data that was not stored or
transmitted before.
- [ ] This PR requires additional review of its security implications
for other reasons.

If any security implications apply, add Jason Ashbaugh (GitHub username:
StewGoin) as a reviewer and do not merge this PR without his approval.
  • Loading branch information
alex-dzeda committed Jun 3, 2024
1 parent 7ee0535 commit 40c68ec
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 14 deletions.
71 changes: 67 additions & 4 deletions bcda/api/v1/api.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package v1

import (
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
"strings"
"time"

Expand Down Expand Up @@ -284,7 +287,12 @@ func ServeData(w http.ResponseWriter, r *http.Request) {
dataDir := conf.GetEnv("FHIR_PAYLOAD_DIR")
fileName := chi.URLParam(r, "fileName")
jobID := chi.URLParam(r, "jobID")
w.Header().Set(constants.ContentType, "application/fhir+ndjson")
filePath := fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName)

encoded, err := isGzipEncoded(filePath)
if err != nil {
writeServeDataFailure(err, w)
}

var useGZIP bool
for _, header := range r.Header.Values("Accept-Encoding") {
Expand All @@ -293,20 +301,75 @@ func ServeData(w http.ResponseWriter, r *http.Request) {
break
}
}

w.Header().Set(constants.ContentType, "application/fhir+ndjson")
if useGZIP {
w.Header().Set("Content-Encoding", "gzip")
gz := gzip.NewWriter(w)
defer gz.Close()

gzw := gzipResponseWriter{Writer: gz, ResponseWriter: w}
http.ServeFile(gzw, r, fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName))
if encoded {
http.ServeFile(w, r, filePath)
} else {
http.ServeFile(gzw, r, filePath)
}

} else {
log.API.Warnf("API request to serve data is being made without gzip for file %s for jobId %s", fileName, jobID)
http.ServeFile(w, r, fmt.Sprintf("%s/%s/%s", dataDir, jobID, fileName))
if encoded {
//We'll do the following: 1. Open file, 2. De-compress it, 3. Serve it up.
file, err := os.Open(filePath) // #nosec G304
if err != nil {
writeServeDataFailure(err, w)
return
}
defer file.Close() //#nosec G307
gzipReader, err := gzip.NewReader(file)
if err != nil {
writeServeDataFailure(err, w)
return
}
defer gzipReader.Close()
_, err = io.Copy(w, gzipReader) // #nosec G110
if err != nil {
writeServeDataFailure(err, w)
return
}
} else {
http.ServeFile(w, r, filePath)
}
}
}

// This function is not necessary, but helps meet the sonarQube quality gates
func writeServeDataFailure(err error, w http.ResponseWriter) {
log.API.Error(err)
http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
}

// This function reads a file's magic number, to determine if it is gzip-encoded or not.
func isGzipEncoded(filePath string) (encoded bool, err error) {
file, err := os.Open(filePath) // #nosec G304
if err != nil {
return false, err
}
defer file.Close() //#nosec G307

byteSlice := make([]byte, 2)
bytesRead, err := file.Read(byteSlice)
if err != nil {
return false, err
}

//We can't compare to a magic number if there's less than 2 bytes returned. Also can't be ndjson
if bytesRead < 2 {
return false, errors.New("Invalid file with length 1 byte.")
}

comparison := []byte{0x1f, 0x8b}
return bytes.Equal(comparison, byteSlice), nil
}

/*
swagger:route GET /api/v1/metadata metadata metadata
Expand Down
27 changes: 17 additions & 10 deletions bcda/api/v1/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,28 +354,30 @@ func (s *APITestSuite) TestDeleteJob() {
}

func (s *APITestSuite) TestServeData() {
conf.SetEnv(s.T(), "FHIR_PAYLOAD_DIR", "../../../bcdaworker/data/test")
conf.SetEnv(s.T(), "FHIR_PAYLOAD_DIR", "../../../shared_files/gzip_feature_test/")

tests := []struct {
name string
headers []string
gzipExpected bool
fileName string
validFile bool
}{
{"gzip-only", []string{"gzip"}, true},
{"gzip-deflate", []string{"gzip, deflate"}, true},
{"gzip-deflate-br-handle", []string{"gzip,deflate, br"}, true},
{"gzip", []string{"deflate", "br", "gzip"}, true},
{"non-gzip w/ deflate and br", []string{"deflate", "br"}, false},
{"non-gzip", nil, false},
{"no-header-gzip-encoded", []string{""}, false, "test_gzip_encoded.ndjson", true},
{"yes-header-gzip-encoded", []string{"gzip"}, true, "test_gzip_encoded.ndjson", true},
{"yes-header-not-encoded", []string{"gzip"}, true, "test_no_encoding.ndjson", true},
{"bad file name", []string{""}, false, "not_a_real_file", false},
{"single byte file", []string{""}, false, "single_byte_file.bin", false},
{"no-header-corrupt-file", []string{""}, false, "corrupt_gz_file.ndjson", false}, //This file is kind of cool. has magic number, but otherwise arbitrary data.
}

for _, tt := range tests {
s.T().Run(tt.name, func(t *testing.T) {
defer s.SetupTest()
req := httptest.NewRequest("GET", "/data/test.ndjson", nil)
req := httptest.NewRequest("GET", fmt.Sprintf("/data/%s", tt.fileName), nil)

rctx := chi.NewRouteContext()
rctx.URLParams.Add("fileName", "test.ndjson")
rctx.URLParams.Add("fileName", tt.fileName)
req = req.WithContext(context.WithValue(req.Context(), chi.RouteCtxKey, rctx))

var useGZIP bool
Expand All @@ -390,6 +392,11 @@ func (s *APITestSuite) TestServeData() {
handler := http.HandlerFunc(ServeData)
handler.ServeHTTP(s.rr, req)

if !tt.validFile {
assert.Equal(t, http.StatusInternalServerError, s.rr.Code)
return
}

assert.Equal(t, http.StatusOK, s.rr.Code)
assert.Equal(t, "application/fhir+ndjson", s.rr.Result().Header.Get(constants.ContentType))

Expand All @@ -407,7 +414,7 @@ func (s *APITestSuite) TestServeData() {
b = s.rr.Body.Bytes()
}

assert.Contains(t, string(b), `{"resourceType": "Bundle", "total": 33, "entry": [{"resource": {"status": "active", "diagnosis": [{"diagnosisCodeableConcept": {"coding": [{"system": "http://hl7.org/fhir/sid/icd-9-cm", "code": "2113"}]},`)
assert.Contains(t, string(b), `{"billablePeriod":{"end":"2016-10-30","start":"2016-10-30"},"contained":[{"birthDate":"1953-05-18","gender":"male","id":"patient","identifier":[{"system":"http://hl7.org/fhir/sid/us-mbi","type":{"coding":[{"code":"MC","display":"Patient's Medicare Number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"1S00E00KW61"}],"name":[{"family":"Quitzon246","given":["Rodolfo763"],"text":"Rodolfo763 Quitzon246 ([max 10 chars of first], [max 15 chars of last])"}],"resourceType":"Patient"},{"id":"provider-org","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/meda-prov-6","type":{"coding":[{"code":"PRN","display":"Provider number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"450702"},{"system":"https://bluebutton.cms.gov/resources/variables/fiss/fed-tax-nb","type":{"coding":[{"code":"TAX","display":"Tax ID number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"XX-XXXXXXX"},{"system":"http://hl7.org/fhir/sid/us-npi","type":{"coding":[{"code":"npi","display":"National Provider Identifier","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"8884381863"}],"resourceType":"Organization"}],"created":"2024-04-19T11:37:21-04:00","diagnosis":[{"diagnosisCodeableConcept":{"coding":[{"code":"P292","system":"http://hl7.org/fhir/sid/icd-10-cm"}]},"sequence":0}],"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd"}}],"facility":{"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd"}}]},"id":"f-LTEwMDE5NTYxNA","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/dcn","type":{"coding":[{"code":"uc","display":"Unique Claim ID","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"dcn-100195614"}],"insurance":[{"coverage":{"identifier":{"system":"https://bluebutton.cms.gov/resources/variables/fiss/payers-name","value":"MEDICARE"}},"focal":true,"sequence":0}],"meta":{"lastUpdated":"2023-04-13T18:18:27.334-04:00"},"patient":{"reference":"#patient"},"priority":{"coding":[{"code":"normal","display":"Normal","system":"http://terminology.hl7.org/CodeSystem/processpriority"}]},"provider":{"reference":"#provider-org"},"resourceType":"Claim","status":"active","supportingInfo":[{"category":{"coding":[{"code":"typeofbill","display":"Type of Bill","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBSupportingInfoType"}]},"code":{"coding":[{"code":"1","system":"https://bluebutton.cms.gov/resources/variables/fiss/freq-cd"}]},"sequence":1}],"total":{"currency":"USD","value":639.66},"type":{"coding":[{"code":"institutional","display":"Institutional","system":"http://terminology.hl7.org/CodeSystem/claim-type"}]},"use":"claim"}`)

})
}
Expand Down
Binary file not shown.
1 change: 1 addition & 0 deletions shared_files/gzip_feature_test/single_byte_file.bin
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file not shown.
1 change: 1 addition & 0 deletions shared_files/gzip_feature_test/test_no_encoding.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"billablePeriod":{"end":"2016-10-30","start":"2016-10-30"},"contained":[{"birthDate":"1953-05-18","gender":"male","id":"patient","identifier":[{"system":"http://hl7.org/fhir/sid/us-mbi","type":{"coding":[{"code":"MC","display":"Patient's Medicare Number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"1S00E00KW61"}],"name":[{"family":"Quitzon246","given":["Rodolfo763"],"text":"Rodolfo763 Quitzon246 ([max 10 chars of first], [max 15 chars of last])"}],"resourceType":"Patient"},{"id":"provider-org","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/meda-prov-6","type":{"coding":[{"code":"PRN","display":"Provider number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"450702"},{"system":"https://bluebutton.cms.gov/resources/variables/fiss/fed-tax-nb","type":{"coding":[{"code":"TAX","display":"Tax ID number","system":"http://terminology.hl7.org/CodeSystem/v2-0203"}]},"value":"XX-XXXXXXX"},{"system":"http://hl7.org/fhir/sid/us-npi","type":{"coding":[{"code":"npi","display":"National Provider Identifier","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"8884381863"}],"resourceType":"Organization"}],"created":"2024-04-19T11:37:21-04:00","diagnosis":[{"diagnosisCodeableConcept":{"coding":[{"code":"P292","system":"http://hl7.org/fhir/sid/icd-10-cm"}]},"sequence":0}],"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/serv-typ-cd"}}],"facility":{"extension":[{"url":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd","valueCoding":{"code":"2","system":"https://bluebutton.cms.gov/resources/variables/fiss/lob-cd"}}]},"id":"f-LTEwMDE5NTYxNA","identifier":[{"system":"https://bluebutton.cms.gov/resources/variables/fiss/dcn","type":{"coding":[{"code":"uc","display":"Unique Claim ID","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBIdentifierType"}]},"value":"dcn-100195614"}],"insurance":[{"coverage":{"identifier":{"system":"https://bluebutton.cms.gov/resources/variables/fiss/payers-name","value":"MEDICARE"}},"focal":true,"sequence":0}],"meta":{"lastUpdated":"2023-04-13T18:18:27.334-04:00"},"patient":{"reference":"#patient"},"priority":{"coding":[{"code":"normal","display":"Normal","system":"http://terminology.hl7.org/CodeSystem/processpriority"}]},"provider":{"reference":"#provider-org"},"resourceType":"Claim","status":"active","supportingInfo":[{"category":{"coding":[{"code":"typeofbill","display":"Type of Bill","system":"http://hl7.org/fhir/us/carin-bb/CodeSystem/C4BBSupportingInfoType"}]},"code":{"coding":[{"code":"1","system":"https://bluebutton.cms.gov/resources/variables/fiss/freq-cd"}]},"sequence":1}],"total":{"currency":"USD","value":639.66},"type":{"coding":[{"code":"institutional","display":"Institutional","system":"http://terminology.hl7.org/CodeSystem/claim-type"}]},"use":"claim"}

0 comments on commit 40c68ec

Please sign in to comment.