2016-07-14 19 views
0

Inspiriert von der akzeptierten Antwort auf diese Frage versuche ich, PycURL in eine requests-ähnliche Schnittstelle zu kapseln. Alles wäre in Ordnung, aber nachdem ich der Beschreibung in der PycURL-Dokumentation gefolgt bin, wie man die Kodierung der Nachricht aus den Headern ausliest, habe ich das folgende Problem: Der Header-Callback wird zwar für jeden Response-Header aufgerufen, aber erst, nachdem der Iterator begonnen hat, Response-Zeilen zu liefern, was die Erkennung der Kodierung bzw. des Zeichensatzes sinnlos macht. Titel der Frage: PycURL verarbeitet den Body vor den Headern.

Hier ist der Code:

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object):
    """Line-streaming HTTP response built on a PycURL easy handle inside a multi handle.

    This listing reproduces the problem discussed in the surrounding text:
    ``_split_lines_from_chunks`` reads ``self.headers`` before the first chunk
    has been pulled from ``_iter_chunks``. At that point ``curl_multi.perform()``
    has not run yet, so ``header_function`` has not been called and the charset
    lookup sees whatever ``__init__`` left in ``self.headers`` (an empty dict).
    """

    # Timeout passed to ``CurlMulti.select`` between ``perform`` rounds.
    SELECT_TIMEOUT = 10
    # Codec used to decode header lines and, when no charset is found, the body.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    def __init__(self, method, url, data=None, params=None, headers=None):
     """Configure the transfer; no ``perform`` call is made here.

     ``method`` is set as the custom request verb, ``headers`` (a mapping)
     is turned into ``Name: value`` strings and ``params`` (a mapping) is
     appended to ``url`` as a query string.

     NOTE(review): ``data`` is accepted but never used in this method.
     """
     # The URL without the query string is kept for error reporting.
     self.url = url
     # Body bytes written by libcurl accumulate here until they are drained.
     self.received_buffer = io.BytesIO()

     self.curl = pycurl.Curl()
     self.curl.setopt(pycurl.CUSTOMREQUEST, method)
     if headers:
      self.curl.setopt(
       pycurl.HTTPHEADER,
       [
        '{}: {}'.format(key, value)
        for key, value in headers.items()
       ]
      )
     if params:
      # NOTE(review): keys and values are inserted verbatim, without
      # percent-encoding, and '?' is appended even if url already has one.
      query_string = '&'.join((
       '{}={}'.format(key, value)
       for key, value in params.items()
      ))
      url = '{}?{}'.format(url, query_string)
     self.curl.setopt(pycurl.URL, url)
     self.curl.setopt(pycurl.ENCODING, 'gzip')
     self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
     # Header lines go to header_function, body bytes straight into the buffer.
     self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
     self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

     self.curl_multi = pycurl.CurlMulti()
     self.curl_multi.add_handle(self.curl)

     # 0 means "not read from the handle yet" (see _check_status_code).
     self.status_code = 0
     # Filled by header_function; keys are lower-cased header names.
     self.headers = {}

    def _any_data_received(self):
     """Return True if the buffer position is non-zero, i.e. bytes were written."""
     return self.received_buffer.tell() != 0

    def _get_received_data(self):
     """Return the buffered bytes and reset the buffer to empty."""
     result = self.received_buffer.getvalue()
     self.received_buffer.truncate(0)
     self.received_buffer.seek(0)
     return result

    def _check_status_code(self):
     """Read the HTTP status once and raise ``HTTPError`` unless it is 200.

     A status of 0 is treated as "not available yet" and does not raise.
     """
     if self.status_code == 0:
      self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
     if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
      raise urllib.error.HTTPError(
       self.url, self.status_code, None, None, None
      )

    def _perform_on_curl(self):
     """Call ``perform`` until it stops returning ``E_CALL_MULTI_PERFORM``.

     Returns the handle count reported by the last ``perform`` call.
     """
     while True:
      ret, num_handles = self.curl_multi.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
       break
     return num_handles

    def _iter_chunks(self):
     """Generator driving the transfer and yielding raw body chunks (bytes).

     Nothing runs until the generator is advanced for the first time.
     After the loop ends the status code and the multi handle's messages
     are checked once more.
     """
     while True:
      remaining = self._perform_on_curl()
      if self._any_data_received():
       self._check_status_code()
       yield self._get_received_data()
      # perform reported no remaining handles: the transfer is over.
      if remaining == 0:
       break
      self.curl_multi.select(self.SELECT_TIMEOUT)

     self._check_status_code()
     self._check_curl_errors()

    def _check_curl_errors(self):
     """Raise ``pycurl.error`` for the first entry of ``info_read()[2]``.

     NOTE(review): index 2 is presumably the list of failed transfers as
     ``(handle, errno, message)`` tuples; confirm against the PycURL docs.
     """
     for f in self.curl_multi.info_read()[2]:
      raise pycurl.error(*f[1:])

    def iter_lines(self):
     """Return a generator of decoded body lines (``str``)."""
     chunks = self._iter_chunks()
     return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
     """Generator that splits byte chunks into lines and decodes them.

     Everything above the ``for`` loop runs on the first ``next()`` call,
     BEFORE ``chunks`` has been advanced. Since ``chunks`` is what drives
     the transfer, ``self.headers`` has not been filled by the header
     callback at that point, so the charset chosen here is based on an
     empty dict.
     """
     # Debug trace referenced by the console output shown with this listing.
     print('foo')
     print(self.headers)
     charset = None
     if 'content-type' in self.headers:
      content_type = self.headers['content-type'].lower()
      match = re.search('charset=(\S+)', content_type)
      if match:
       charset = match.group(1)
       print('Decoding using %s' % charset)
     if charset is None:
      charset = self.HTTP_STANDARD_ENCODING
      print('Assuming encoding is %s' % charset)
     # Trailing partial line carried over to the next chunk (bytes or None).
     pending = None
     for chunk in chunks:
      if pending is not None:
       chunk = pending + chunk
      lines = chunk.splitlines()
      # If the last byte of the last split line equals the last byte of
      # the chunk, the chunk did not end with a line terminator, so the
      # last line is incomplete and is held back.
      if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
       pending = lines.pop()
      else:
       pending = None
      for line in lines:
       yield line.decode(charset)
     # Flush a final line that was not terminated.
     if pending is not None:
      yield pending.decode(charset)

    def header_function(self, header_line):
     """Header callback: store one ``Name: value`` line in ``self.headers``.

     ``header_line`` is decoded with ``HTTP_STANDARD_ENCODING``. Lines
     without a colon are ignored. Names are stripped and lower-cased; a
     repeated name overwrites the earlier value.
     """
     # Debug trace referenced by the console output shown with this listing.
     print('hello')
     header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
     if ':' not in header_line:
      return
     name, value = header_line.split(':', 1)
     name = name.strip()
     value = value.strip()
     name = name.lower()
     self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Create a streaming request object when ``stream`` is truthy.

    With ``stream`` falsy nothing is created and ``None`` is returned; only
    the streaming path is implemented here. Otherwise all arguments are
    forwarded to ``CurlHTTPStream``.
    """
    if not stream:
        return None
    stream_options = {'data': data, 'params': params, 'headers': headers}
    return CurlHTTPStream(method, url, **stream_options)

Und das ist, was im Terminal passiert, wenn ich versuche, es zu testen:

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux 
Type "help", "copyright", "credits" or "license" for more information. 
>>> from pycurl_requests.requests import request 
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True) 
>>> for l in r.iter_lines(): 
...  print(l) 
... 
foo 
{} 
Assuming encoding is iso-8859-1 
hello 
hello 
hello 
hello 
hello 
hello 
hello 
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]} 
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]} 
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]} 

Aus dem Changes-Feed von CouchDB kommen noch weitere Zeilen, aber ich habe die Ausgabe gekürzt, da sie nicht relevant sind.

Im Wesentlichen zeigt foo in der Ausgabe an, dass der Block betreten wird, in dem die Header erwartet werden, aber die nächste Zeile zeigt, dass self.headers leer ist. Jedes hello steht für einen Aufruf von header_function(). Wie kann es sein, dass der Write-Callback, der den Body in das BytesIO schreibt, aufgerufen wird, bevor der Header-Callback ausgelöst wird?

Antwort

0

Ich habe die Lösung gefunden. Das Problem war, dass _split_lines_from_chunks(self, chunks) ausgeführt wurde, bevor überhaupt irgendetwas von der Antwort angekommen war, also waren auch die Header noch nicht da.

Hier ist der Code, der funktioniert. Der Zeichensatz wird erst ermittelt, wenn die erste Zeile des Bodys verfügbar ist; zu diesem Zeitpunkt sind mit Sicherheit bereits alle Header verarbeitet.

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object):
    """Stream an HTTP response line by line through PycURL's multi interface.

    The transfer is lazy: nothing is sent until the iterator returned by
    :meth:`iter_lines` is advanced. Response headers are collected by
    :meth:`header_function`, and the body charset is resolved only once body
    data is available, i.e. after the header callback has run.
    """

    SELECT_TIMEOUT = 10
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    # ``charset`` parameter of a Content-Type value. Optional quotes are
    # skipped and the match stops at ';' so that following parameters (or a
    # closing quote) do not end up in the codec name.
    _CHARSET_RE = re.compile(
        r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE
    )

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the transfer without starting it.

        :param method: HTTP verb, set as libcurl's custom request string.
        :param url: target URL; stored unchanged in ``self.url``.
        :param data: optional request body (bytes, str or a mapping that is
            form-encoded); ``None`` sends no body.
        :param params: optional mapping appended to ``url`` as a
            percent-encoded query string.
        :param headers: optional mapping of request header names to values.
        """
        self.url = url
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                ['{}: {}'.format(key, value)
                 for key, value in headers.items()]
            )
        self.curl.setopt(pycurl.URL, self._build_url(url, params))
        body = self._encode_body(data)
        if body is not None:
            # CUSTOMREQUEST above still decides the verb on the wire.
            self.curl.setopt(pycurl.POSTFIELDS, body)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        # 0 means "not read from the handle yet".
        self.status_code = 0
        # Lower-cased header name -> value, filled by header_function.
        self.headers = {}
        self._charset = None

    @staticmethod
    def _build_url(url, params):
        """Return ``url`` with ``params`` appended as an encoded query string."""
        if not params:
            return url
        from urllib.parse import urlencode
        # Extend an existing query string instead of adding a second '?'.
        separator = '&' if '?' in url else '?'
        return '{}{}{}'.format(url, separator, urlencode(params))

    @staticmethod
    def _encode_body(data):
        """Return ``data`` as bytes for the request body, or None if absent."""
        if data is None:
            return None
        if isinstance(data, (bytes, bytearray)):
            return bytes(data)
        if isinstance(data, str):
            return data.encode('utf-8')
        from urllib.parse import urlencode
        return urlencode(data).encode('ascii')

    def _any_data_received(self):
        """Return True if body bytes are waiting in the buffer."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and empty the buffer."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Read the status code once; raise ``HTTPError`` unless it is 200.

        A status of 0 (not available yet) does not raise.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Run ``perform`` until libcurl stops asking for another call.

        Returns the number of handles still active.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Drive the transfer and yield raw body chunks as they arrive.

        :raises urllib.error.HTTPError: if the response status is not 200.
        :raises pycurl.error: if libcurl reports a failed transfer.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failed transfer, if any."""
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator yielding the decoded lines of the body."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Split an iterable of byte chunks into decoded lines.

        Lines are yielded without their terminator. A trailing fragment that
        is not terminated by ``\\n`` is held back until the next chunk; this
        includes a fragment ending in a bare ``\\r``, so a CRLF pair split
        across two chunks does not produce a spurious empty line.
        """
        pending = b''
        for chunk in chunks:
            # Resolved after the first chunk arrived, hence after the headers.
            charset = self.charset
            lines = (pending + chunk).splitlines(keepends=True)
            pending = b''
            if lines and not lines[-1].endswith(b'\n'):
                pending = lines.pop()
            for line in lines:
                yield line.rstrip(b'\r\n').decode(charset)
        if pending:
            # Flush what is left; splitlines drops a held-back trailing CR.
            for line in pending.splitlines():
                yield line.decode(self.charset)

    @property
    def charset(self):
        """Codec name for the body, taken from the Content-Type header.

        Falls back to ``HTTP_STANDARD_ENCODING`` when no charset is declared.
        The result is cached only once a Content-Type header has been seen,
        so reading this property before the headers arrive does not pin the
        fallback.
        """
        if self._charset is not None:
            return self._charset
        content_type = self.headers.get('content-type')
        if content_type is None:
            return self.HTTP_STANDARD_ENCODING
        match = self._CHARSET_RE.search(content_type)
        if match:
            self._charset = match.group(1).lower()
        else:
            self._charset = self.HTTP_STANDARD_ENCODING
        return self._charset

    def header_function(self, header_line):
        """PycURL header callback: record one ``Name: value`` header line.

        :param header_line: raw header line as bytes.

        Lines without a colon (status line, blank separator) are ignored.
        Names are lower-cased; a repeated header overwrites the earlier value.
        """
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        self.headers[name.strip().lower()] = value.strip()


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Return a ``CurlHTTPStream`` for the request if ``stream`` is truthy.

    When ``stream`` is falsy the function returns ``None``; a non-streaming
    mode is not implemented in this listing.
    """
    return (
        CurlHTTPStream(method, url, data=data, params=params, headers=headers)
        if stream
        else None
    )