Detect encoding in Response::text() (#256)

* Detect encoding and decode text response

Fixes #246

* Try to get encoding from Content-Type header

* Remove uchardet encoding detection for now

* Add non utf-8 test case for Response::text()

* Reduce copies
This commit is contained in:
messense
2018-02-16 03:01:57 +08:00
committed by Sean McArthur
parent f241fce38d
commit 0203fad886
4 changed files with 66 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ categories = ["web-programming::http-client"]
[dependencies]
bytes = "0.4"
encoding_rs = "0.7"
futures = "0.1.15"
hyper = "0.11.9"
hyper-tls = "0.1.2"

View File

@@ -129,6 +129,7 @@
//! [cookiejar_issue]: https://github.com/seanmonstar/reqwest/issues/14
extern crate bytes;
extern crate encoding_rs;
#[macro_use]
extern crate futures;
extern crate hyper;

View File

@@ -2,7 +2,9 @@ use std::mem;
use std::fmt;
use std::io::{self, Read};
use std::time::Duration;
use std::borrow::Cow;
use encoding_rs::{Encoding, UTF_8};
use futures::{Async, Poll, Stream};
use serde::de::DeserializeOwned;
use serde_json;
@@ -167,6 +169,11 @@ impl Response {
/// Get the response text.
///
/// This method decodes the response body with BOM sniffing
/// and with malformed sequences replaced with the REPLACEMENT CHARACTER.
/// Encoding is determined from the `charset` parameter of the `Content-Type` header,
/// and defaults to `utf-8` if not present.
///
/// # Example
///
/// ```rust
@@ -180,9 +187,28 @@ impl Response {
let len = self.headers().get::<::header::ContentLength>()
.map(|ct_len| **ct_len)
.unwrap_or(0);
let mut content = String::with_capacity(len as usize);
self.read_to_string(&mut content).map_err(::error::from)?;
Ok(content)
let mut content = Vec::with_capacity(len as usize);
self.read_to_end(&mut content).map_err(::error::from)?;
let encoding_name = self.headers().get::<::header::ContentType>()
.and_then(|content_type| {
content_type.get_param("charset")
.map(|charset| charset.as_str())
})
.unwrap_or("utf-8");
let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
// a block because of borrow checker
{
let (text, _, _) = encoding.decode(&content);
match text {
Cow::Owned(s) => return Ok(s),
_ => (),
}
}
unsafe {
// decoding returned Cow::Borrowed, meaning these bytes
// are already valid utf8
Ok(String::from_utf8_unchecked(content))
}
}
/// Copy the response body into a writer.

View File

@@ -38,6 +38,41 @@ fn test_response_text() {
assert_eq!(b"Hello", body.as_bytes());
}
#[test]
fn test_response_non_utf_8_text() {
    // The canned response declares `charset=gbk` and carries the four
    // GBK bytes C4 E3 BA C3; the assertions at the end expect `text()`
    // to hand them back as the UTF-8 string "你好".
    let server = server! {
        request: b"\
            GET /text HTTP/1.1\r\n\
            Host: $HOST\r\n\
            User-Agent: $USERAGENT\r\n\
            Accept: */*\r\n\
            Accept-Encoding: gzip\r\n\
            \r\n\
            ",
        response: b"\
            HTTP/1.1 200 OK\r\n\
            Server: test\r\n\
            Content-Length: 4\r\n\
            Content-Type: text/plain; charset=gbk\r\n\
            \r\n\
            \xc4\xe3\xba\xc3\
            "
    };

    let target = format!("http://{}/text", server.addr());
    let mut response = reqwest::get(&target).unwrap();

    // Response metadata must be reported unchanged.
    assert_eq!(response.url().as_str(), &target);
    assert_eq!(response.status(), reqwest::StatusCode::Ok);
    assert_eq!(
        response.headers().get(),
        Some(&reqwest::header::Server::new("test".to_string()))
    );
    assert_eq!(
        response.headers().get(),
        Some(&reqwest::header::ContentLength(4))
    );

    // The body is checked both as text and as raw UTF-8 bytes.
    let decoded = response.text().unwrap();
    assert_eq!("你好", &decoded);
    assert_eq!(b"\xe4\xbd\xa0\xe5\xa5\xbd", decoded.as_bytes()); // Now it's utf-8
}
#[test]
fn test_response_copy_to() {
let server = server! {