Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3333,4 +3333,23 @@ The config file should contain every possible key for documentation purposes."
.success()
.stdout(contains("https://example.com"));
}

/// URLs should NOT be downloaded fully, unless their link has a fragment.
///
/// Pipes four links to the binary via stdin with `--include-fragments`
/// enabled: three release-image links that carry no fragment, and one
/// documentation link ending in `#options`. The run is expected to exit
/// successfully with `--timeout=5`.
///
/// NOTE(review): presumably the image files are far too large to fetch
/// within the timeout, so a successful run implies their bodies were not
/// read — confirm. The test also appears to depend on these external hosts
/// being reachable at test time.
#[test]
fn test_large_file_lazy_download() {
cargo_bin_cmd!()
// `-` selects stdin as the input; the links are supplied by `write_stdin` below.
.arg("-")
.arg("--include-fragments")
.arg("--timeout=5")
.write_stdin(
"
https://www.releases.ubuntu.com/noble/ubuntu-24.04.3-desktop-amd64.iso
https://www.releases.ubuntu.com/noble/ubuntu-24.04.3-live-server-amd64.iso
https://www.releases.ubuntu.com/noble/ubuntu-24.04.3-wsl-amd64.wsl
https://lychee.cli.rs/guides/cli/#options
",
)
.assert()
.success();
}
}
32 changes: 22 additions & 10 deletions lychee-lib/src/checker/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{
BasicAuthCredentials, ErrorKind, FileType, Status, Uri,
chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
quirks::Quirks,
ratelimit::{CacheableResponse, HostPool},
ratelimit::HostPool,
retry::RetryExt,
types::{redirect_history::RedirectHistory, uri::github::GithubUri},
utils::fragment_checker::{FragmentChecker, FragmentInput},
Expand All @@ -11,7 +11,7 @@ use async_trait::async_trait;
use http::{Method, StatusCode};
use octocrab::Octocrab;
use reqwest::{Request, header::CONTENT_TYPE};
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
use std::{borrow::Cow, collections::HashSet, path::Path, sync::Arc, time::Duration};
use url::Url;

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -121,15 +121,22 @@ impl WebsiteChecker {
let method = request.method().clone();
let request_url = request.url().clone();

match self.host_pool.execute_request(request).await {
let check_request_fragments = self.include_fragments
&& method == Method::GET
&& request_url.fragment().is_some_and(|x| !x.is_empty());

match self
.host_pool
.execute_request(request, check_request_fragments)
.await
{
Ok(response) => {
let status = Status::new(&response, &self.accepted);
// when `accept=200,429`, `status_code=429` will be treated as success
// but we are not able to check the fragment since it's inapplicable.
if self.include_fragments
if let Some(content) = response.text
&& check_request_fragments
&& response.status.is_success()
&& method == Method::GET
&& request_url.fragment().is_some_and(|x| !x.is_empty())
{
let Some(content_type) = response
.headers
Expand All @@ -152,7 +159,7 @@ impl WebsiteChecker {
_ => return status,
};

self.check_html_fragment(request_url, status, response, file_type)
self.check_html_fragment(request_url, status, &content, file_type)
.await
} else {
status
Expand All @@ -166,13 +173,18 @@ impl WebsiteChecker {
&self,
url: Url,
status: Status,
response: CacheableResponse,
content: &str,
file_type: FileType,
) -> Status {
let content = response.text;
match self
.fragment_checker
.check(FragmentInput { content, file_type }, &url)
.check(
FragmentInput {
content: Cow::Borrowed(content),
file_type,
},
&url,
)
.await
{
Ok(true) => status,
Expand Down
46 changes: 32 additions & 14 deletions lychee-lib/src/ratelimit/host/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,19 @@ impl Host {
}
}

/// Check if a URI is cached and return the cached status if valid
///
/// # Panics
///
/// Panics if the statistics mutex is poisoned
fn get_cached_status(&self, uri: &Uri) -> Option<CacheableResponse> {
self.cache.get(uri).map(|v| v.clone())
/// Checks whether `uri` is cached and returns a clone of the cached response
/// if it satisfies the `needs_body` requirement.
///
/// Returns `None` when there is no cache entry for `uri`, or when
/// `needs_body` is `true` but the cached entry has no body (`text` is
/// `None`). Callers treat `None` as a cache miss.
fn get_cached_status(&self, uri: &Uri, needs_body: bool) -> Option<CacheableResponse> {
    let cached = self.cache.get(uri)?;

    // An entry stored without a body cannot serve a request that needs one.
    // Decide this before cloning so a rejected entry is never copied.
    if needs_body && cached.text.is_none() {
        return None;
    }

    Some(cached.clone())
}

fn record_cache_hit(&self) {
Expand Down Expand Up @@ -128,11 +134,18 @@ impl Host {
/// # Panics
///
/// Panics if the statistics mutex is poisoned
pub(crate) async fn execute_request(&self, request: Request) -> Result<CacheableResponse> {
let uri = Uri::from(request.url().clone());
pub(crate) async fn execute_request(
&self,
request: Request,
needs_body: bool,
) -> Result<CacheableResponse> {
let mut url = request.url().clone();
url.set_fragment(None);
let uri = Uri::from(url);

let _permit = self.acquire_semaphore().await;

if let Some(cached) = self.get_cached_status(&uri) {
if let Some(cached) = self.get_cached_status(&uri, needs_body) {
self.record_cache_hit();
return Ok(cached);
}
Expand All @@ -143,20 +156,25 @@ impl Host {
rate_limiter.until_ready().await;
}

if let Some(cached) = self.get_cached_status(&uri) {
if let Some(cached) = self.get_cached_status(&uri, needs_body) {
self.record_cache_hit();
return Ok(cached);
}

self.record_cache_miss();
self.perform_request(request, uri).await
self.perform_request(request, uri, needs_body).await
}

/// Returns a shared reference to the client stored in this host's `client`
/// field (the same client used to execute requests for this host).
pub(crate) const fn get_client(&self) -> &ReqwestClient {
&self.client
}

async fn perform_request(&self, request: Request, uri: Uri) -> Result<CacheableResponse> {
async fn perform_request(
&self,
request: Request,
uri: Uri,
needs_body: bool,
) -> Result<CacheableResponse> {
let start_time = Instant::now();
let response = match self.client.execute(request).await {
Ok(response) => response,
Expand All @@ -170,7 +188,7 @@ impl Host {
self.update_backoff(response.status());
self.handle_rate_limit_headers(&response);

let response = CacheableResponse::try_from(response).await?;
let response = CacheableResponse::from_response(response, needs_body).await?;
self.cache_result(&uri, response.clone());
Ok(response)
}
Expand Down
10 changes: 7 additions & 3 deletions lychee-lib/src/ratelimit/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,21 @@ use crate::{ErrorKind, Result};
#[derive(Debug, Clone)]
pub(crate) struct CacheableResponse {
pub(crate) status: reqwest::StatusCode,
pub(crate) text: String,
pub(crate) text: Option<String>,
pub(crate) headers: HeaderMap,
pub(crate) url: Url,
}

impl CacheableResponse {
async fn try_from(response: Response) -> Result<Self> {
async fn from_response(response: Response, needs_body: bool) -> Result<Self> {
let status = response.status();
let headers = response.headers().clone();
let url = response.url().clone();
let text = response.text().await.map_err(ErrorKind::ReadResponseBody)?;
let text = if needs_body {
Some(response.text().await.map_err(ErrorKind::ReadResponseBody)?)
} else {
None
};

Ok(Self {
status,
Expand Down
8 changes: 6 additions & 2 deletions lychee-lib/src/ratelimit/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,15 @@ impl HostPool {
/// Fails if:
/// - The request URL has no valid hostname
/// - The underlying HTTP request fails
pub(crate) async fn execute_request(&self, request: Request) -> Result<CacheableResponse> {
pub(crate) async fn execute_request(
&self,
request: Request,
needs_body: bool,
) -> Result<CacheableResponse> {
let url = request.url();
let host_key = HostKey::try_from(url)?;
let host = self.get_or_create_host(host_key);
host.execute_request(request).await
host.execute_request(request, needs_body).await
}

/// Try to build a [`Request`]
Expand Down
13 changes: 8 additions & 5 deletions lychee-lib/src/utils/fragment_checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,21 @@ use tokio::{fs, sync::Mutex};
use url::Url;

/// Holds the content and file type of the fragment input.
pub(crate) struct FragmentInput {
pub content: String,
pub(crate) struct FragmentInput<'a> {
pub content: Cow<'a, str>,
pub file_type: FileType,
}

impl FragmentInput {
impl FragmentInput<'_> {
pub(crate) async fn from_path(path: &Path) -> Result<Self> {
let content = fs::read_to_string(path)
.await
.map_err(|err| ErrorKind::ReadFileInput(err, path.to_path_buf()))?;
let file_type = FileType::from(path);
Ok(Self { content, file_type })
Ok(Self {
content: Cow::Owned(content),
file_type,
})
}
}

Expand Down Expand Up @@ -117,7 +120,7 @@ impl FragmentChecker {
/// (Empty # and #top fragments are always valid, triggering the browser to scroll to top.)
///
/// In all other cases, returns true.
pub(crate) async fn check(&self, input: FragmentInput, url: &Url) -> Result<bool> {
pub(crate) async fn check(&self, input: FragmentInput<'_>, url: &Url) -> Result<bool> {
let Some(fragment) = url.fragment() else {
return Ok(true);
};
Expand Down