Detect encoding in Response::text() (#256)
* Detect encoding and decode text response Fixes #246 * Try to get encoding from Content-Type header * Remove uchardet encoding detection for now * Add non utf-8 test case for Response::text() * Reduce copies
This commit is contained in:
		| @@ -11,6 +11,7 @@ categories = ["web-programming::http-client"] | |||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| bytes = "0.4" | bytes = "0.4" | ||||||
|  | encoding_rs = "0.7" | ||||||
| futures = "0.1.15" | futures = "0.1.15" | ||||||
| hyper = "0.11.9" | hyper = "0.11.9" | ||||||
| hyper-tls = "0.1.2" | hyper-tls = "0.1.2" | ||||||
|   | |||||||
| @@ -129,6 +129,7 @@ | |||||||
| //! [cookiejar_issue]: https://github.com/seanmonstar/reqwest/issues/14 | //! [cookiejar_issue]: https://github.com/seanmonstar/reqwest/issues/14 | ||||||
|  |  | ||||||
| extern crate bytes; | extern crate bytes; | ||||||
|  | extern crate encoding_rs; | ||||||
| #[macro_use] | #[macro_use] | ||||||
| extern crate futures; | extern crate futures; | ||||||
| extern crate hyper; | extern crate hyper; | ||||||
|   | |||||||
| @@ -2,7 +2,9 @@ use std::mem; | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::io::{self, Read}; | use std::io::{self, Read}; | ||||||
| use std::time::Duration; | use std::time::Duration; | ||||||
|  | use std::borrow::Cow; | ||||||
|  |  | ||||||
|  | use encoding_rs::{Encoding, UTF_8}; | ||||||
| use futures::{Async, Poll, Stream}; | use futures::{Async, Poll, Stream}; | ||||||
| use serde::de::DeserializeOwned; | use serde::de::DeserializeOwned; | ||||||
| use serde_json; | use serde_json; | ||||||
| @@ -167,6 +169,11 @@ impl Response { | |||||||
|  |  | ||||||
|     /// Get the response text. |     /// Get the response text. | ||||||
|     /// |     /// | ||||||
|  |     /// This method decodes the response body with BOM sniffing | ||||||
|  |     /// and with malformed sequences replaced with the REPLACEMENT CHARACTER. | ||||||
|  |     /// Encoding is determined from the `charset` parameter of the `Content-Type` header, | ||||||
|  |     /// and defaults to `utf-8` if not present. | ||||||
|  |     /// | ||||||
|     /// # Example |     /// # Example | ||||||
|     /// |     /// | ||||||
|     /// ```rust |     /// ```rust | ||||||
| @@ -180,9 +187,28 @@ impl Response { | |||||||
|         let len = self.headers().get::<::header::ContentLength>() |         let len = self.headers().get::<::header::ContentLength>() | ||||||
|             .map(|ct_len| **ct_len) |             .map(|ct_len| **ct_len) | ||||||
|             .unwrap_or(0); |             .unwrap_or(0); | ||||||
|         let mut content = String::with_capacity(len as usize); |         let mut content = Vec::with_capacity(len as usize); | ||||||
|         self.read_to_string(&mut content).map_err(::error::from)?; |         self.read_to_end(&mut content).map_err(::error::from)?; | ||||||
|         Ok(content) |         let encoding_name = self.headers().get::<::header::ContentType>() | ||||||
|  |             .and_then(|content_type| { | ||||||
|  |                 content_type.get_param("charset") | ||||||
|  |                     .map(|charset| charset.as_str()) | ||||||
|  |             }) | ||||||
|  |             .unwrap_or("utf-8"); | ||||||
|  |         let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8); | ||||||
|  |         // a block because of borrow checker | ||||||
|  |         { | ||||||
|  |             let (text, _, _) = encoding.decode(&content); | ||||||
|  |             match text { | ||||||
|  |                 Cow::Owned(s) => return Ok(s), | ||||||
|  |                 _ => (), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         unsafe { | ||||||
|  |             // decoding returned Cow::Borrowed, meaning these bytes | ||||||
|  |             // are already valid utf8 | ||||||
|  |             Ok(String::from_utf8_unchecked(content)) | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Copy the response body into a writer. |     /// Copy the response body into a writer. | ||||||
|   | |||||||
| @@ -38,6 +38,41 @@ fn test_response_text() { | |||||||
|     assert_eq!(b"Hello", body.as_bytes()); |     assert_eq!(b"Hello", body.as_bytes()); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[test] | ||||||
|  | fn test_response_non_utf_8_text() { | ||||||
|  |     let server = server! { | ||||||
|  |         request: b"\ | ||||||
|  |             GET /text HTTP/1.1\r\n\ | ||||||
|  |             Host: $HOST\r\n\ | ||||||
|  |             User-Agent: $USERAGENT\r\n\ | ||||||
|  |             Accept: */*\r\n\ | ||||||
|  |             Accept-Encoding: gzip\r\n\ | ||||||
|  |             \r\n\ | ||||||
|  |             ", | ||||||
|  |         response: b"\ | ||||||
|  |             HTTP/1.1 200 OK\r\n\ | ||||||
|  |             Server: test\r\n\ | ||||||
|  |             Content-Length: 4\r\n\ | ||||||
|  |             Content-Type: text/plain; charset=gbk\r\n\ | ||||||
|  |             \r\n\ | ||||||
|  |             \xc4\xe3\xba\xc3\ | ||||||
|  |             " | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     let url = format!("http://{}/text", server.addr()); | ||||||
|  |     let mut res = reqwest::get(&url).unwrap(); | ||||||
|  |     assert_eq!(res.url().as_str(), &url); | ||||||
|  |     assert_eq!(res.status(), reqwest::StatusCode::Ok); | ||||||
|  |     assert_eq!(res.headers().get(), | ||||||
|  |                Some(&reqwest::header::Server::new("test".to_string()))); | ||||||
|  |     assert_eq!(res.headers().get(), | ||||||
|  |                Some(&reqwest::header::ContentLength(4))); | ||||||
|  |  | ||||||
|  |     let body = res.text().unwrap(); | ||||||
|  |     assert_eq!("你好", &body); | ||||||
|  |     assert_eq!(b"\xe4\xbd\xa0\xe5\xa5\xbd", body.as_bytes());  // Now it's utf-8 | ||||||
|  | } | ||||||
|  |  | ||||||
| #[test] | #[test] | ||||||
| fn test_response_copy_to() { | fn test_response_copy_to() { | ||||||
|     let server = server! { |     let server = server! { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user