Support parent extraction from header blocks

Signed-off-by: Alper Rifat Ulucinar <ulucinar@users.noreply.github.com>
crossplane · Sep 26, 2022 · ddc7f3d
1 parent 4bae257
commit ddc7f3d
Show file tree

Hide file tree

Showing 5 changed files with 229 additions and 160 deletions.
diff --git a/pkg/registry/meta.go b/pkg/registry/meta.go
@@ -12,6 +12,7 @@ import (
 	"io/ioutil"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"
 
 	"github.com/antchfx/htmlquery"
@@ -34,7 +35,8 @@ const (
 )
 
 var (
-	regexConfigurationBlock = regexp.MustCompile(`block.*support`)
+	regexConfigurationBlock = regexp.MustCompile(`block.*(support)?`)
+	regexHeaderNode         = regexp.MustCompile(`h\d`)
 )
 
 // NewProviderMetadata initializes a new ProviderMetadata for
@@ -236,7 +238,6 @@ func (r *Resource) scrapePrelude(doc *html.Node, preludeXPath string) error {
 }
 
 func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) {
-	conflictedFields := make(map[string]bool)
 	processed := make(map[*html.Node]struct{})
 	codeNodes := htmlquery.Find(doc, fieldXPath)
 	for _, n := range codeNodes {
@@ -249,31 +250,61 @@ func (r *Resource) scrapeFieldDocs(doc *html.Node, fieldXPath string) {
 			r.ArgumentDocs = make(map[string]string)
 		}
 		if r.ArgumentDocs[attrName] != "" && r.ArgumentDocs[attrName] != strings.TrimSpace(docStr) {
-			conflictedFields[attrName] = true
 			continue
 		}
 		r.ArgumentDocs[attrName] = strings.TrimSpace(docStr)
 	}
-
-	// Remove descriptions for repeating fields in the registry.
-	for cf := range conflictedFields {
-		delete(r.ArgumentDocs, cf)
-	}
 }
 
-func getRootPath(n *html.Node) string { // nolint: gocyclo
-	var ulNode, pNode, codeNode *html.Node
+func (r *Resource) getRootPath(n *html.Node) string {
+	var ulNode, pNode *html.Node
 	for ulNode = n.Parent; ulNode != nil && ulNode.Data != "ul"; ulNode = ulNode.Parent {
 	}
 	if ulNode == nil {
 		return ""
 	}
-	for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !checkBlockParagraph(pNode)); pNode = pNode.PrevSibling {
-		// intentionally left empty
+	for pNode = ulNode.PrevSibling; pNode != nil && (pNode.Data != "p" || !regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode)))); pNode = pNode.PrevSibling {
+		if regexHeaderNode.MatchString(pNode.Data) {
+			return r.extractRootFromHeader(pNode)
+		}
 	}
 	if pNode == nil {
 		return ""
 	}
+	return r.extractRootFromParagraph(pNode)
+}
+
+func (r *Resource) extractRootFromHeader(pNode *html.Node) string {
+	headerText := extractText(pNode)
+	if _, ok := r.ArgumentDocs[headerText]; ok {
+		return headerText
+	}
+	sortedKeys := make([]string, 0, len(r.ArgumentDocs))
+	for k := range r.ArgumentDocs {
+		sortedKeys = append(sortedKeys, k)
+	}
+	sort.Strings(sortedKeys)
+	for _, k := range sortedKeys {
+		parts := strings.Split(k, ".")
+		if headerText == parts[len(parts)-1] {
+			return k
+		}
+	}
+	if _, ok := r.ArgumentDocs[strings.ReplaceAll(headerText, " ", ".")]; ok {
+		return strings.ReplaceAll(headerText, " ", ".")
+	}
+	if regexConfigurationBlock.MatchString(strings.ToLower(extractText(pNode))) {
+		for _, s := range strings.Split(headerText, " ") {
+			if _, ok := r.ArgumentDocs[s]; ok {
+				return s
+			}
+		}
+	}
+	return ""
+}
+
+func (r *Resource) extractRootFromParagraph(pNode *html.Node) string {
+	var codeNode *html.Node
 	for codeNode = pNode.FirstChild; codeNode != nil && codeNode.Data != "code"; codeNode = codeNode.NextSibling {
 		// intentionally left empty
 	}
@@ -284,7 +315,7 @@ func getRootPath(n *html.Node) string { // nolint: gocyclo
 	if prevLiNode == nil {
 		return codeNode.FirstChild.Data
 	}
-	root := getRootPath(prevLiNode)
+	root := r.getRootPath(prevLiNode)
 	if len(root) == 0 {
 		return codeNode.FirstChild.Data
 	}
@@ -308,14 +339,27 @@ func getPrevLiWithCodeText(codeText string, pNode *html.Node) *html.Node {
 	return nil
 }
 
-func checkBlockParagraph(p *html.Node) bool {
-	// traverse children of the paragraph node
-	for c := p.FirstChild; c != nil; c = c.NextSibling {
-		if regexConfigurationBlock.MatchString(c.Data) {
-			return true
+func extractText(n *html.Node) string {
+	switch n.Type { // nolint:exhaustive
+	case html.TextNode:
+		return n.Data
+	case html.ElementNode:
+		sb := strings.Builder{}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			s := ""
+			if c.Type != html.TextNode {
+				s = extractText(c)
+			} else {
+				s = c.Data
+			}
+			if len(s) != 0 {
+				sb.WriteString(s)
+			}
 		}
+		return sb.String()
+	default:
+		return ""
 	}
-	return false
 }
 
 func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map[*html.Node]struct{}) string {
@@ -331,7 +375,7 @@ func (r *Resource) scrapeDocString(n *html.Node, attrName *string, processed map
 	sb := strings.Builder{}
 	if *attrName == "" {
 		*attrName = n.Data
-		if root := getRootPath(n); len(root) != 0 {
+		if root := r.getRootPath(n); len(root) != 0 {
 			*attrName = fmt.Sprintf("%s.%s", root, *attrName)
 		}
 	} else {

diff --git a/pkg/registry/testdata/aws/pm.yaml b/pkg/registry/testdata/aws/pm.yaml
@@ -141,8 +141,10 @@ resources:
             bucket: '- (Required, Forces new resource) The name of the bucket.'
             expected_bucket_owner: '- (Optional, Forces new resource) The account ID of the expected bucket owner.'
             grantee.email_address: '- (Optional) Email address of the grantee. See Regions and Endpoints for supported AWS regions where this argument can be specified.'
+            grantee.id: '- (Optional) The canonical user ID of the grantee.'
             grantee.type: '- (Required) Type of grantee. Valid values: CanonicalUser, AmazonCustomerByEmail, Group.'
             grantee.uri: '- (Optional) URI of the grantee group.'
+            id: '- The bucket, expected_bucket_owner (if configured), and acl (if configured) separated by commas (,).'
             owner.display_name: '- (Optional) The display name of the owner.'
             owner.id: '- (Required) The ID of the owner.'
         importStatements: []