From 503d351b2e76f4c980287667722edfea7e1167d7 Mon Sep 17 00:00:00 2001 From: Laurent Arnoud Date: Sun, 1 Mar 2020 18:49:39 +0100 Subject: [PATCH 1/4] Add test when url is invalid and panic --- tests/test_reqwest_async.rs | 14 +++++++++++++- tests/test_reqwest_blocking.rs | 12 ++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/test_reqwest_async.rs b/tests/test_reqwest_async.rs index a5ecd0806..5da6b0f50 100644 --- a/tests/test_reqwest_async.rs +++ b/tests/test_reqwest_async.rs @@ -3,6 +3,7 @@ use robotparser::service::RobotsTxtService; use reqwest::Client; use url::Url; use tokio::runtime::Runtime; +use url::{Host, Origin}; #[test] fn test_reqwest_async() { @@ -13,4 +14,15 @@ fn test_reqwest_async() { let robots_txt = robots_txt_response.unwrap().get_result(); let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); assert!(robots_txt.can_fetch("*", &fetch_url)); -} \ No newline at end of file + let fetch_url = Url::parse("http://www.python.org/webstats/").unwrap(); + assert!(!robots_txt.can_fetch("*", &fetch_url)); +} + +#[test] +#[should_panic] +fn test_reqwest_blocking_panic_url() { + let client = Client::new(); + let host = Host::Domain("python.org::".into()); + let origin = Origin::Tuple("https".into(), host, 80); + client.fetch_robots_txt(origin); +} diff --git a/tests/test_reqwest_blocking.rs b/tests/test_reqwest_blocking.rs index 1c38c4eb3..42e529df2 100644 --- a/tests/test_reqwest_blocking.rs +++ b/tests/test_reqwest_blocking.rs @@ -2,6 +2,7 @@ use robotparser::http::RobotsTxtClient; use robotparser::service::RobotsTxtService; use reqwest::blocking::Client; use url::Url; +use url::{Host, Origin}; #[test] fn test_reqwest_blocking() { @@ -10,4 +11,15 @@ fn test_reqwest_blocking() { let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); assert!(robots_txt.can_fetch("*", &fetch_url)); + let fetch_url = Url::parse("https://www.python.org/webstats/").unwrap(); + assert!(!robots_txt.can_fetch("*", &fetch_url)); +} + +#[test] +#[should_panic] +fn test_reqwest_blocking_panic_url() { + let client = Client::new(); + let host = Host::Domain("python.org::".into()); + let origin = Origin::Tuple("https".into(), host, 80); + client.fetch_robots_txt(origin).unwrap().get_result(); } From e150c1f044d0d38ec9b24b26919a43107d70680b Mon Sep 17 00:00:00 2001 From: Laurent Arnoud Date: Sun, 1 Mar 2020 20:02:02 +0100 Subject: [PATCH 2/4] Initial error handling ref https://github.com/messense/robotparser-rs/issues/22 --- src/http/reqwest/async_reqwest.rs | 11 ++++++----- src/http/reqwest/sync_reqwest.rs | 13 +++++++------ src/model.rs | 4 +++- src/model/errors.rs | 21 +++++++++++++++++++++ tests/test_reqwest_async.rs | 8 +++++--- tests/test_reqwest_blocking.rs | 6 ++++-- 6 files changed, 46 insertions(+), 17 deletions(-) create mode 100644 src/model/errors.rs diff --git a/src/http/reqwest/async_reqwest.rs b/src/http/reqwest/async_reqwest.rs index ea87d5f2a..24e5b6bf6 100644 --- a/src/http/reqwest/async_reqwest.rs +++ b/src/http/reqwest/async_reqwest.rs @@ -6,6 +6,7 @@ use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; +use crate::model::{RobotparserError, ErrorKind}; use std::pin::Pin; use futures::task::{Context, Poll}; use futures::Future; @@ -15,10 +16,10 @@ use futures::future::ok as future_ok; type FetchFuture = Box>>; impl RobotsTxtClient for Client { - type Result = RobotsTxtResponse; + type Result = Result; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).expect("Unable to parse robots.txt url"); + let url = Url::parse(&url).map_err(|err| RobotparserError {kind: ErrorKind::Url(err)})?; let mut request = Request::new(Method::GET, url); let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); let response = self @@ -30,10 +31,10 @@ impl RobotsTxtClient for Client { }); }); let response: Pin>>> = Box::pin(response); - return RobotsTxtResponse { + Ok(RobotsTxtResponse { origin, response, - } + }) } } @@ -73,4 +74,4 @@ impl Future for RobotsTxtResponse { }, } } -} \ No newline at end of file +} diff --git a/src/http/reqwest/sync_reqwest.rs b/src/http/reqwest/sync_reqwest.rs index 0365d66db..a8e433490 100644 --- a/src/http/reqwest/sync_reqwest.rs +++ b/src/http/reqwest/sync_reqwest.rs @@ -1,23 +1,24 @@ use reqwest::blocking::{Client, Request}; -use reqwest::{Method, Error}; +use reqwest::Method; use reqwest::header::HeaderValue; use url::{Origin, Url}; use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; +use crate::model::{RobotparserError, ErrorKind}; impl RobotsTxtClient for Client { - type Result = Result, Error>; + type Result = Result, RobotparserError>; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).expect("Unable to parse robots.txt url"); + let url = Url::parse(&url).map_err(|err| RobotparserError {kind: ErrorKind::Url(err)})?; let mut request = Request::new(Method::GET, url); let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); - let response = self.execute(request)?; + let response = self.execute(request).map_err(|err| RobotparserError {kind: ErrorKind::HttpClient(err)})?; let status_code = response.status().as_u16(); - let text = response.text()?; + let text = response.text().map_err(|err| RobotparserError {kind: ErrorKind::HttpClient(err)})?; let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); return Ok(robots_txt); } -} \ No newline at end of file +} diff --git a/src/model.rs b/src/model.rs index 483385d4b..16f1885f4 100644 --- a/src/model.rs +++ b/src/model.rs @@ -14,4 +14,6 @@ pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer; mod fetched_robots_txt; pub use self::robots_txt::RobotsTxt; mod path; -pub (crate) use self::path::Path; \ No newline at end of file +pub (crate) use self::path::Path; +mod errors; +pub use self::errors::{RobotparserError, ErrorKind}; diff --git a/src/model/errors.rs b/src/model/errors.rs new file mode 100644 index 000000000..a0ef4a8da --- /dev/null +++ b/src/model/errors.rs @@ -0,0 +1,21 @@ +use std::fmt; + +#[derive(Debug)] +pub struct RobotparserError { + pub kind: ErrorKind, +} + +#[derive(Debug)] +pub enum ErrorKind { + Url(url::ParseError), + HttpClient(reqwest::Error), +} + +impl fmt::Display for RobotparserError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Url(ref err) => err.fmt(f), + ErrorKind::HttpClient(ref err) => err.fmt(f), + } + } +} diff --git a/tests/test_reqwest_async.rs b/tests/test_reqwest_async.rs index 5da6b0f50..3701b2b52 100644 --- a/tests/test_reqwest_async.rs +++ b/tests/test_reqwest_async.rs @@ -10,7 +10,7 @@ fn test_reqwest_async() { let mut runtime = Runtime::new().unwrap(); let client = Client::new(); let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); - let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin())); + let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()).unwrap()); let robots_txt = robots_txt_response.unwrap().get_result(); let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); assert!(robots_txt.can_fetch("*", &fetch_url)); @@ -19,10 +19,12 @@ fn test_reqwest_async() { } #[test] -#[should_panic] fn test_reqwest_blocking_panic_url() { let client = Client::new(); let host = Host::Domain("python.org::".into()); let origin = Origin::Tuple("https".into(), host, 80); - client.fetch_robots_txt(origin); + match client.fetch_robots_txt(origin) { + Ok(_) => assert!(false), + Err(_) => assert!(true) + } } diff --git a/tests/test_reqwest_blocking.rs b/tests/test_reqwest_blocking.rs index 42e529df2..b82681127 100644 --- a/tests/test_reqwest_blocking.rs +++ b/tests/test_reqwest_blocking.rs @@ -16,10 +16,12 @@ fn test_reqwest_blocking() { } #[test] -#[should_panic] fn test_reqwest_blocking_panic_url() { let client = Client::new(); let host = Host::Domain("python.org::".into()); let origin = Origin::Tuple("https".into(), host, 80); - client.fetch_robots_txt(origin).unwrap().get_result(); + match client.fetch_robots_txt(origin) { + Ok(_) => assert!(false), + Err(_) => assert!(true) + } } From df30fe6701ba4b6df25660ded4d0161026cc37f6 Mon Sep 17 00:00:00 2001 From: Laurent Arnoud Date: Sat, 7 Mar 2020 20:37:44 +0100 Subject: [PATCH 3/4] Rename ErrorKind::HttpClient => ErrorKind::Http --- src/http/reqwest/sync_reqwest.rs | 4 ++-- src/model/errors.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/http/reqwest/sync_reqwest.rs b/src/http/reqwest/sync_reqwest.rs index a8e433490..7c23cebd1 100644 --- a/src/http/reqwest/sync_reqwest.rs +++ b/src/http/reqwest/sync_reqwest.rs @@ -15,9 +15,9 @@ impl RobotsTxtClient for Client { let url = Url::parse(&url).map_err(|err| RobotparserError {kind: ErrorKind::Url(err)})?; let mut request = Request::new(Method::GET, url); let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); - let response = self.execute(request).map_err(|err| RobotparserError {kind: ErrorKind::HttpClient(err)})?; + let response = self.execute(request).map_err(|err| RobotparserError {kind: ErrorKind::Http(err)})?; let status_code = response.status().as_u16(); - let text = response.text().map_err(|err| RobotparserError {kind: ErrorKind::HttpClient(err)})?; + let text = response.text().map_err(|err| RobotparserError {kind: ErrorKind::Http(err)})?; let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); return Ok(robots_txt); } diff --git a/src/model/errors.rs b/src/model/errors.rs index a0ef4a8da..37036495c 100644 --- a/src/model/errors.rs +++ b/src/model/errors.rs @@ -8,14 +8,14 @@ pub struct RobotparserError { #[derive(Debug)] pub enum ErrorKind { Url(url::ParseError), - HttpClient(reqwest::Error), + Http(reqwest::Error), } impl fmt::Display for RobotparserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.kind { ErrorKind::Url(ref err) => err.fmt(f), - ErrorKind::HttpClient(ref err) => err.fmt(f), + ErrorKind::Http(ref err) => err.fmt(f), } } } From b2525b5ac5278185d2384102ebc1655ccec1d10c Mon Sep 17 00:00:00 2001 From: Laurent Arnoud Date: Sun, 8 Mar 2020 12:27:59 +0100 Subject: [PATCH 4/4] Implement std::error::Error and rename to Error --- src/http/reqwest/async_reqwest.rs | 15 ++++++++------- src/http/reqwest/sync_reqwest.rs | 10 +++++----- src/model.rs | 2 +- src/model/errors.rs | 6 ++++-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/http/reqwest/async_reqwest.rs b/src/http/reqwest/async_reqwest.rs index 24e5b6bf6..288d119cc 100644 --- a/src/http/reqwest/async_reqwest.rs +++ b/src/http/reqwest/async_reqwest.rs @@ -1,25 +1,26 @@ use reqwest::{Client, Request}; -use reqwest::{Method, Error}; +use reqwest::Method; +use reqwest::Error as ReqwestError; use reqwest::header::HeaderValue; use url::{Origin, Url}; use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; -use crate::model::{RobotparserError, ErrorKind}; +use crate::model::{Error, ErrorKind}; use std::pin::Pin; use futures::task::{Context, Poll}; use futures::Future; use futures::future::TryFutureExt; use futures::future::ok as future_ok; -type FetchFuture = Box>>; +type FetchFuture = Box>>; impl RobotsTxtClient for Client { - type Result = Result; + type Result = Result; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).map_err(|err| RobotparserError {kind: ErrorKind::Url(err)})?; + let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?; let mut request = Request::new(Method::GET, url); let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); let response = self @@ -30,7 +31,7 @@ impl RobotsTxtClient for Client { return future_ok((response_info, response_text)); }); }); - let response: Pin>>> = Box::pin(response); + let response: Pin>>> = Box::pin(response); Ok(RobotsTxtResponse { origin, response, @@ -56,7 +57,7 @@ impl RobotsTxtResponse { } impl Future for RobotsTxtResponse { - type Output = Result, Error>; + type Output = Result, ReqwestError>; fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll { let self_mut = self.get_mut(); diff --git a/src/http/reqwest/sync_reqwest.rs b/src/http/reqwest/sync_reqwest.rs index 7c23cebd1..671cca410 100644 --- a/src/http/reqwest/sync_reqwest.rs +++ b/src/http/reqwest/sync_reqwest.rs @@ -6,18 +6,18 @@ use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; -use crate::model::{RobotparserError, ErrorKind}; +use crate::model::{Error, ErrorKind}; impl RobotsTxtClient for Client { - type Result = Result, RobotparserError>; + type Result = Result, Error>; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).map_err(|err| RobotparserError {kind: ErrorKind::Url(err)})?; + let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?; let mut request = Request::new(Method::GET, url); let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); - let response = self.execute(request).map_err(|err| RobotparserError {kind: ErrorKind::Http(err)})?; + let response = self.execute(request).map_err(|err| Error {kind: ErrorKind::Http(err)})?; let status_code = response.status().as_u16(); - let text = response.text().map_err(|err| RobotparserError {kind: ErrorKind::Http(err)})?; + let text = response.text().map_err(|err| Error {kind: ErrorKind::Http(err)})?; let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); return Ok(robots_txt); } diff --git a/src/model.rs b/src/model.rs index 16f1885f4..d56bd4a23 100644 --- a/src/model.rs +++ b/src/model.rs @@ -16,4 +16,4 @@ pub use self::robots_txt::RobotsTxt; mod path; pub (crate) use self::path::Path; mod errors; -pub use self::errors::{RobotparserError, ErrorKind}; +pub use self::errors::{Error, ErrorKind}; diff --git a/src/model/errors.rs b/src/model/errors.rs index 37036495c..dd631eb53 100644 --- a/src/model/errors.rs +++ b/src/model/errors.rs @@ -1,7 +1,7 @@ use std::fmt; #[derive(Debug)] -pub struct RobotparserError { +pub struct Error { pub kind: ErrorKind, } @@ -11,7 +11,7 @@ pub enum ErrorKind { Http(reqwest::Error), } -impl fmt::Display for RobotparserError { +impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.kind { ErrorKind::Url(ref err) => err.fmt(f), @@ -19,3 +19,5 @@ impl fmt::Display for RobotparserError { } } } + +impl std::error::Error for Error {}