From 34b791b95aac3617e910dc6be7152630b4482a4b Mon Sep 17 00:00:00 2001 From: Carter Hollman Date: Sun, 20 Oct 2024 03:04:03 -0700 Subject: [PATCH] First working implementation of handleSearch, works with Indeed --- build.gradle | 1 + .../cd_scraper/search/handleSearch.java | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 src/main/java/org/codedevils/scraper/cd_scraper/search/handleSearch.java diff --git a/build.gradle b/build.gradle index ee3c252..1c77989 100644 --- a/build.gradle +++ b/build.gradle @@ -30,6 +30,7 @@ ext { } dependencies { + implementation 'org.seleniumhq.selenium:selenium-java:4.25.0' implementation 'org.springframework.boot:spring-boot-starter-data-jpa' implementation 'org.springframework.boot:spring-boot-starter-web' developmentOnly 'org.springframework.boot:spring-boot-docker-compose' diff --git a/src/main/java/org/codedevils/scraper/cd_scraper/search/handleSearch.java b/src/main/java/org/codedevils/scraper/cd_scraper/search/handleSearch.java new file mode 100644 index 0000000..3ae80b4 --- /dev/null +++ b/src/main/java/org/codedevils/scraper/cd_scraper/search/handleSearch.java @@ -0,0 +1,82 @@ +package org.codedevils.scraper.cd_scraper.search; + +import org.openqa.selenium.By; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +public class handleSearch{ + public static ArrayList handleSearch(String link, String criteria){ + //New WebDriver with implicit wait of 5 seconds if elements aren't found on page + WebDriver driver = new ChromeDriver(); + driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(5)); + + //Setup page and declare search variable cuz scope and stuff + driver.get(link); + WebElement searchField; + + + //Try to find the search field via XPATH, to add functionality for more websites can add their specific XPATH to + //try block via concatenation of the following string " | " + try{ + searchField = driver.findElement(By.xpath( + "//input[@type='text' or @type='search' or @placeholder[contains(., 'Search')]]" + )); + }catch(Exception e){ + return null; + } + + //Since we found a search field, clear it, input criteria, submit to navigate to next page + searchField.clear(); + searchField.sendKeys(criteria); + searchField.submit(); + + + + //Should now be on page where we can scrape for the links + ArrayList links = new ArrayList<>(); + + + //Loop is for changing pages if necessary + boolean next = true; + while(next) { + + //If we don't find a next button then we break the loop of changing pages + WebElement nextButton = null; + try{ + nextButton = driver.findElement(By.xpath("//a[@data-testid='pagination-page-next']")); + }catch(Exception e){ + next = false; + } + + + + //Isolate individual result elements into a list to iterate over + List results = driver.findElements(By.xpath("//li[@class='css-1ac2h1w eu4oa1w0']")); + + //For every individual result, find its anchor tag and extract the href attribute. + //That attribute holds the link we are looking for, so we add to outgoing list :) + //NOTE: Some elements in the results do not have link or are filler try/catch ignores them + for (WebElement result : results) { + try { + String temp = result.findElement(By.tagName("a")).getAttribute("href"); + links.add(temp); + } catch (Exception ignored) { + + } + } + + //Continue through the pages :) + if(nextButton != null) + driver.navigate().to(nextButton.getAttribute("href")); + } + + //All done, success!!!!! + driver.quit(); + return links; + } +} +