简单分析C之Curl模块同php的curl和python的pycurl模块的关系
缘起:以前一直喜欢用scrapy做爬虫,实践效果也很好。后来单位让我写一套分布式爬虫(python实现),替代公司原有的爬虫(php实现)。投入实践后,发现效果确实比原来好:原来20个网站里能配置10个,现在20个里能配置16个。分析原因,是新架构在设计上多了一点扩展性,在大致架构不变的基础上可以进行有限的扩展;而两者的实现原理其实都是通过CURL来实现的。
php的curl,位于php发行包的ext目录中,是php自带的扩展;启用时需要修改php的配置文件php.ini,将 ;extension=php_curl.dll 前的分号去掉。
python的pycurl,不是python自带的模块,需要另行安装(安装过程略)。python做爬虫一般都是用urllib、urllib2、twisted等,比较少使用pycurl。
c的curl,是前面两种语言的curl的基础:先有c的curl,才有了php的curl和python的pycurl。同时,python的pycurl文档说明它只实现了部分功能,即它是c的curl的一个阉割版。泪奔,原来用了那么长时间的东西,连冰山一角都没触碰到;或者说,用python的pycurl时也只是用到了其中的一个或少数几个功能。
如何用:
C的curl:
#include <stdio.h> #include <curl/curl.h> int main(void) { CURL *curl; CURLcode res; curl = curl_easy_init(); if(curl) { /* First set the URL that is about to receive our POST. This URL can just as well be a https:// URL if that is what should receive the data. */ curl_easy_setopt(curl, CURLOPT_URL, "http://postit.example.com/moo.cgi"); /* Now specify the POST data */ curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "name=daniel&project=curl"); /* Perform the request, res will get the return code */ res = curl_easy_perform(curl); /* always cleanup */ curl_easy_cleanup(curl); } return 0; }
php的curl:
<?php
// Fetch a page with PHP's cURL extension and print the response body.
$c = curl_init();
curl_setopt($c, CURLOPT_URL, 'http://www.baidu.com');
// Make curl_exec() return the body as a string instead of writing it
// straight to the output.
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);

$data = curl_exec($c);
// curl_exec() returns false on failure; read the error text while the
// handle is still open.
$error = ($data === false) ? curl_error($c) : null;
curl_close($c);

if ($error !== null) {
    echo 'cURL error: ', $error;
} else {
    // Print the downloaded content (the original echoed the closed handle $c).
    echo $data;
}
?>
python的pycurl:
import pycurl


def body(buffer):
    """WRITEFUNCTION callback: print the response data handed over by pycurl."""
    print(buffer)


c = pycurl.Curl()
try:
    c.setopt(pycurl.URL, "http://www.baidu.com/")
    # Route the received body to body() instead of the default output.
    c.setopt(pycurl.WRITEFUNCTION, body)
    c.perform()
finally:
    # Close the handle even when perform() raises.
    c.close()
主要原理:
C:
使用到的数据结构:
/* Public handle type: callers only ever see an opaque void; the library
   casts it back to struct SessionHandle internally. */
typedef void CURL; /* at initialisation time the handle is exposed only as a void type */

/* Internal state behind one easy handle. */
struct SessionHandle {
  struct Names dns;
  struct Curl_multi *multi;    /* original note said "used for multi-thread
                                  handling"; NOTE(review): presumably the
                                  multi handle this easy handle is attached
                                  to (cf. curl_multi_add_handle) - confirm */
  struct Curl_one_easy *multi_pos; /* if non-NULL, points to the its position
                                      in multi controlling structure to assist
                                      in removal. */
  struct Curl_share *share;    /* Share, handles global variable mutexing */
  struct HandleData reqdata;   /* Request-specific data */
  struct UserDefined set;      /* values set by the libcurl user; this is
                                  what the setopt code writes into */
  struct DynamicStatic change; /* possibly modified userdefined data */
  struct CookieInfo *cookies;  /* the cookies, read from files and servers */
  struct Progress progress;    /* for all the progress meter data */
  struct UrlState state;       /* struct for fields used for state info and
                                  other dynamic purposes */
  struct PureInfo info;        /* stats, reports and info data */
#if defined(CURL_DOES_CONVERSIONS) && defined(HAVE_ICONV)
  iconv_t outbound_cd;         /* for translating to the network encoding */
  iconv_t inbound_cd;          /* for translating from the network encoding */
  iconv_t utf8_cd;             /* for translating to UTF8 */
#endif /* CURL_DOES_CONVERSIONS && HAVE_ICONV */
  unsigned int magic;          /* set to a CURLEASY_MAGIC_NUMBER */
};

/* Option values supplied by the application (stored in SessionHandle.set). */
struct UserDefined {
  FILE *err;         /* the stderr user data goes here */
  void *debugdata;   /* the data that will be passed to fdebug */
  char *errorbuffer; /* (Static) store failure messages in here */
  long proxyport;    /* If non-zero, use this port number by default. If the
                        proxy string features a ":[port]" that one will
                        override this. */
  /* ... the remaining fields are omitted from this excerpt ... */
};
使用的方法1:
1.初始化curl,得到sessionhandler结构体空间
/*
 * Create a new easy handle.
 *
 * Runs curl_global_init(CURL_GLOBAL_DEFAULT) first when the library-wide
 * `initialized` flag is not set, then asks Curl_open() for a fresh
 * SessionHandle. Returns the handle, or NULL when either step fails.
 */
CURL *curl_easy_init(void)
{
  struct SessionHandle *handle;

  /* Make sure we inited the global SSL stuff */
  if(!initialized) {
    if(curl_global_init(CURL_GLOBAL_DEFAULT)) {
      /* something in the global init failed, return nothing */
      DEBUGF(fprintf(stderr, "Error: curl_global_init failed\n"));
      return NULL;
    }
  }

  /* We use curl_open() with undefined URL so far */
  if(CURLE_OK != Curl_open(&handle)) {
    DEBUGF(fprintf(stderr, "Error: Curl_open failed\n"));
    return NULL;
  }

  return handle;
}
方法2.
设置参数: CURLcode curl_easy_setopt(CURL *curl, CURLoption tag, ...) { va_list arg; struct SessionHandle *data = curl; CURLcode ret; if(!curl) return CURLE_BAD_FUNCTION_ARGUMENT; va_start(arg, tag); ret = Curl_setopt(data, tag, arg); va_end(arg); return ret; } CURLcode Curl_setopt(struct SessionHandle *data, CURLoption option, va_list param) { char *argptr; CURLcode result = CURLE_OK; #ifndef CURL_DISABLE_HTTP curl_off_t bigsize; #endif switch(option) { case CURLOPT_DNS_CACHE_TIMEOUT: data->set.dns_cache_timeout = va_arg(param, long); break; case CURLOPT_DNS_USE_GLOBAL_CACHE: { long use_cache = va_arg(param, long); if (use_cache) Curl_global_host_cache_init(); data->set.global_dns_cache = (bool)(0 != use_cache); } break; case CURLOPT_SSL_CIPHER_LIST: /* set a list of cipher we want to use in the SSL connection */ result = Curl_setstropt(&data->set.str[STRING_SSL_CIPHER_LIST], va_arg(param, char *)); break; case CURLOPT_RANDOM_FILE: /* * This is the path name to a file that contains random data to seed * the random SSL stuff with. The file is only used for reading. */ result = Curl_setstropt(&data->set.str[STRING_SSL_RANDOM_FILE], va_arg(param, char *)); break; case CURLOPT_EGDSOCKET: /* * The Entropy Gathering Daemon socket pathname */ result = Curl_setstropt(&data->set.str[STRING_SSL_EGDSOCKET], va_arg(param, char *)); break; case CURLOPT_MAXCONNECTS: /* * Set the absolute number of maximum simultaneous alive connection that * libcurl is allowed to have. */ result = Curl_ch_connc(data, data->state.connc, va_arg(param, long)); break; case CURLOPT_FORBID_REUSE: /* * When this transfer is done, it must not be left to be reused by a * subsequent transfer but shall be closed immediately. */ data->set.reuse_forbid = (bool)(0 != va_arg(param, long)); break; case CURLOPT_FRESH_CONNECT: /* * This transfer shall not use a previously cached connection but * should be made with a fresh new connect! 
*/ data->set.reuse_fresh = (bool)(0 != va_arg(param, long)); break; case CURLOPT_VERBOSE: /* * Verbose means infof() calls that give a lot of information about * the connection and transfer procedures as well as internal choices. */ data->set.verbose = (bool)(0 != va_arg(param, long)); break; case CURLOPT_HEADER: /* * Set to include the header in the general data output stream. */ data->set.include_header = (bool)(0 != va_arg(param, long)); break; case CURLOPT_NOPROGRESS: /* * Shut off the internal supported progress meter */ data->set.hide_progress = (bool)(0 != va_arg(param, long)); if(data->set.hide_progress) data->progress.flags |= PGRS_HIDE; else data->progress.flags &= ~PGRS_HIDE; break; case CURLOPT_NOBODY: /* * Do not include the body part in the output data stream. */ data->set.opt_no_body = (bool)(0 != va_arg(param, long)); if(data->set.opt_no_body) /* in HTTP lingo, this means using the HEAD request */ data->set.httpreq = HTTPREQ_HEAD; break; case CURLOPT_FAILONERROR: /* * Don't output the >=300 error code HTML-page, but instead only * return error. */ data->set.http_fail_on_error = (bool)(0 != va_arg(param, long)); break; case CURLOPT_UPLOAD: case CURLOPT_PUT: /* * We want to sent data to the remote host. If this is HTTP, that equals * using the PUT request. */ data->set.upload = (bool)(0 != va_arg(param, long)); if(data->set.upload) /* If this is HTTP, PUT is what's needed to "upload" */ data->set.httpreq = HTTPREQ_PUT; break; case CURLOPT_FILETIME: /* * Try to get the file time of the remote document. The time will * later (possibly) become available using curl_easy_getinfo(). */ data->set.get_filetime = (bool)(0 != va_arg(param, long)); break; case CURLOPT_FTP_CREATE_MISSING_DIRS: /* * An FTP option that modifies an upload to create missing directories on * the server. 
*/ data->set.ftp_create_missing_dirs = (bool)(0 != va_arg(param, long)); break; case CURLOPT_FTP_RESPONSE_TIMEOUT: /* * An FTP option that specifies how quickly an FTP response must be * obtained before it is considered failure. */ data->set.ftp_response_timeout = va_arg( param , long ) * 1000; break; case CURLOPT_DIRLISTONLY: /* * An option that changes the command to one that asks for a list * only, no file info details. */ data->set.ftp_list_only = (bool)(0 != va_arg(param, long)); break; case CURLOPT_APPEND: /* * We want to upload and append to an existing file. */ data->set.ftp_append = (bool)(0 != va_arg(param, long)); break; case CURLOPT_FTP_FILEMETHOD: /* * How do access files over FTP. */ data->set.ftp_filemethod = (curl_ftpfile)va_arg(param, long); break; case CURLOPT_NETRC: /* * Parse the $HOME/.netrc file */ data->set.use_netrc = (enum CURL_NETRC_OPTION)va_arg(param, long); break; case CURLOPT_NETRC_FILE: /* * Use this file instead of the $HOME/.netrc file */ result = Curl_setstropt(&data->set.str[STRING_NETRC_FILE], va_arg(param, char *)); break; case CURLOPT_TRANSFERTEXT: /* * This option was previously named 'FTPASCII'. Renamed to work with * more protocols than merely FTP. * * Transfer using ASCII (instead of BINARY). */ data->set.prefer_ascii = (bool)(0 != va_arg(param, long)); break; case CURLOPT_TIMECONDITION: /* * Set HTTP time condition. This must be one of the defines in the * curl/curl.h header file. */ data->set.timecondition = (curl_TimeCond)va_arg(param, long); break; case CURLOPT_TIMEVALUE: /* * This is the value to compare with the remote document with the * method set with CURLOPT_TIMECONDITION */ data->set.timevalue = (time_t)va_arg(param, long); break; case CURLOPT_SSLVERSION: /* * Set explicit SSL version to try to connect with, as some SSL * implementations are lame. 
*/ data->set.ssl.version = va_arg(param, long); break; #ifndef CURL_DISABLE_HTTP case CURLOPT_AUTOREFERER: /* * Switch on automatic referer that gets set if curl follows locations. */ data->set.http_auto_referer = (bool)(0 != va_arg(param, long)); break; case CURLOPT_ENCODING: /* * String to use at the value of Accept-Encoding header. * * If the encoding is set to "" we use an Accept-Encoding header that * encompasses all the encodings we support. * If the encoding is set to NULL we don't send an Accept-Encoding header * and ignore an received Content-Encoding header. * */ argptr = va_arg(param, char *); result = Curl_setstropt(&data->set.str[STRING_ENCODING], (argptr && !*argptr)? (char *) ALL_CONTENT_ENCODINGS: argptr); break; case CURLOPT_FOLLOWLOCATION: /* * Follow Location: header hints on a HTTP-server. */ data->set.http_follow_location = (bool)(0 != va_arg(param, long)); break; case CURLOPT_UNRESTRICTED_AUTH: /* * Send authentication (user+password) when following locations, even when * hostname changed. */ data->set.http_disable_hostname_check_before_authentication = (bool)(0 != va_arg(param, long)); break; case CURLOPT_MAXREDIRS: /* * The maximum amount of hops you allow curl to follow Location: * headers. This should mostly be used to detect never-ending loops. */ data->set.maxredirs = va_arg(param, long); break; case CURLOPT_POST301: /* * Obey RFC 2616/10.3.2 and resubmit a POST as a POST after a 301. */ data->set.post301 = (bool)(0 != va_arg(param, long)); break; case CURLOPT_POST: /* Does this option serve a purpose anymore? Yes it does, when CURLOPT_POSTFIELDS isn't used and the POST data is read off the callback! */ if(va_arg(param, long)) { data->set.httpreq = HTTPREQ_POST; data->set.opt_no_body = FALSE; /* this is implied */ } else data->set.httpreq = HTTPREQ_GET; break; case CURLOPT_COPYPOSTFIELDS: /* * A string with POST data. Makes curl HTTP POST. Even if it is NULL. 
* If needed, CURLOPT_POSTFIELDSIZE must have been set prior to * CURLOPT_COPYPOSTFIELDS and not altered later. */ argptr = va_arg(param, char *); if (!argptr || data->set.postfieldsize == -1) result = Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], argptr); else { /* * Check that requested length does not overflow the size_t type. */ if ((data->set.postfieldsize < 0) || ((sizeof(curl_off_t) != sizeof(size_t)) && (data->set.postfieldsize > (curl_off_t)((size_t)-1)))) result = CURLE_OUT_OF_MEMORY; else { char * p; (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL); /* Allocate even when size == 0. This satisfies the need of possible later address compare to detect the COPYPOSTFIELDS mode, and to mark that postfields is used rather than read function or form data. */ p = malloc((size_t)(data->set.postfieldsize?data->set.postfieldsize:1)); if (!p) result = CURLE_OUT_OF_MEMORY; else { if (data->set.postfieldsize) memcpy(p, argptr, data->set.postfieldsize); data->set.str[STRING_COPYPOSTFIELDS] = p; } } } data->set.postfields = data->set.str[STRING_COPYPOSTFIELDS]; data->set.httpreq = HTTPREQ_POST; break; case CURLOPT_POSTFIELDS: /* * Like above, but use static data instead of copying it. */ data->set.postfields = va_arg(param, void *); /* Release old copied data. */ (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL); data->set.httpreq = HTTPREQ_POST; break; case CURLOPT_POSTFIELDSIZE: /* * The size of the POSTFIELD data to prevent libcurl to do strlen() to * figure it out. Enables binary posts. */ bigsize = va_arg(param, long); if (data->set.postfieldsize < bigsize && data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) { /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. 
*/ (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL); data->set.postfields = NULL; } data->set.postfieldsize = bigsize; break; case CURLOPT_POSTFIELDSIZE_LARGE: /* * The size of the POSTFIELD data to prevent libcurl to do strlen() to * figure it out. Enables binary posts. */ bigsize = va_arg(param, curl_off_t); if (data->set.postfieldsize < bigsize && data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) { /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */ (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL); data->set.postfields = NULL; } data->set.postfieldsize = bigsize; break; case CURLOPT_HTTPPOST: /* * Set to make us do HTTP POST */ data->set.httppost = va_arg(param, struct curl_httppost *); data->set.httpreq = HTTPREQ_POST_FORM; data->set.opt_no_body = FALSE; /* this is implied */ break; case CURLOPT_REFERER: /* * String to set in the HTTP Referer: field. */ if(data->change.referer_alloc) { free(data->change.referer); data->change.referer_alloc = FALSE; } result = Curl_setstropt(&data->set.str[STRING_SET_REFERER], va_arg(param, char *)); data->change.referer = data->set.str[STRING_SET_REFERER]; break; /**中间省略10000行case情况,但都是想data数据修正值*/ default: /* unknown tag and its companion, just ignore: */ result = CURLE_FAILED_INIT; /* correct this */ break; } return result; }
3.真正发送请求:
/*
 * Run the transfer configured on `easy` and block until it is finished.
 *
 * The easy handle is added to a temporary multi handle, which is driven
 * with curl_multi_perform() and a one-second select() timeout until no
 * transfer is running or select reports an error.
 *
 * Returns CURLE_BAD_FUNCTION_ARGUMENT for a NULL handle,
 * CURLE_OUT_OF_MEMORY / CURLE_FAILED_INIT when the multi handle cannot be
 * set up, otherwise the result carried by the completion message (or
 * CURLE_OK when no message is available).
 */
CURLcode curl_easy_perform(CURL *easy)
{
  CURLM *multi;
  CURLMcode mcode;
  CURLMsg *msg;
  CURLcode code = CURLE_OK;
  int still_running;
  int rc;
  int maxfd;
  struct timeval timeout;
  fd_set fdread;
  fd_set fdwrite;
  fd_set fdexcep;

  if(!easy)
    return CURLE_BAD_FUNCTION_ARGUMENT;

  multi = curl_multi_init();
  if(!multi)
    return CURLE_OUT_OF_MEMORY;

  mcode = curl_multi_add_handle(multi, easy);
  if(mcode) {
    curl_multi_cleanup(multi);
    return (mcode == CURLM_OUT_OF_MEMORY) ?
      CURLE_OUT_OF_MEMORY : CURLE_FAILED_INIT;
  }

  /* we start some action by calling perform right away */
  for(;;) {
    /* keep calling for as long as the multi interface asks for it */
    while(curl_multi_perform(multi, &still_running) ==
          CURLM_CALL_MULTI_PERFORM)
      ;

    if(!still_running)
      break;

    FD_ZERO(&fdread);
    FD_ZERO(&fdwrite);
    FD_ZERO(&fdexcep);

    /* timeout once per second */
    timeout.tv_sec = 1;
    timeout.tv_usec = 0;

    /* Old deprecated style: get file descriptors from the transfers */
    curl_multi_fdset(multi, &fdread, &fdwrite, &fdexcep, &maxfd);
    rc = Curl_select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout);

    /* The way is to extract the sockets and wait for them without using
       select. This whole alternative version should probably rather use the
       curl_multi_socket() approach. */

    if(rc == -1)
      /* select error */
      break;

    /* timeout or data to send/receive => loop! */
  }

  msg = curl_multi_info_read(multi, &rc);
  if(msg)
    code = msg->data.result;

  mcode = curl_multi_remove_handle(multi, easy);
  /* what to do if it fails? */

  mcode = curl_multi_cleanup(multi);
  /* what to do if it fails? */

  return code;
}
4.从内存去除申请的空间:
/*
 * Dispose of an easy handle by passing its SessionHandle to Curl_close().
 * A NULL handle is ignored.
 */
void curl_easy_cleanup(CURL *curl)
{
  struct SessionHandle *data = (struct SessionHandle *)curl;

  if(data)
    Curl_close(data);
}
php:
1.使用的数据结构:
/* State the PHP curl extension keeps for one cURL resource. */
typedef struct {
	struct _php_curl_error err;            /* err.str is registered as CURLOPT_ERRORBUFFER by curl_init */
	struct _php_curl_free *to_free;
	struct _php_curl_send_headers header;
	void ***thread_ctx;
	CURL *cp;                              /* PHP allocates this wrapper struct; it carries the libcurl
	                                          CURL handle, so libcurl options are applied through ch->cp */
	php_curl_handlers *handlers;           /* write / read / write_header handler settings */
	long id;                               /* resource id, filled in by curl_init */
	unsigned int uses;                     /* curl_close decrements this while non-zero instead of
	                                          deleting the resource */
	zend_bool in_callback;                 /* curl_close refuses to close while this is set */
	zval *clone;
} php_curl;
2. 使用的方法:
PHP_FUNCTION(curl_init) { php_curl *ch; CURL *cp; zval *clone; char *url = NULL; int url_len = 0; char *cainfo; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &url, &url_len) == FAILURE) { return; } cp = curl_easy_init(); if (!cp) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Could not initialize a new cURL handle"); RETURN_FALSE; } alloc_curl_handle(&ch); TSRMLS_SET_CTX(ch->thread_ctx); ch->cp = cp; ch->handlers->write->method = PHP_CURL_STDOUT; ch->handlers->write->type = PHP_CURL_ASCII; ch->handlers->read->method = PHP_CURL_DIRECT; ch->handlers->write_header->method = PHP_CURL_IGNORE; ch->uses = 0; MAKE_STD_ZVAL(clone); ch->clone = clone; curl_easy_setopt(ch->cp, CURLOPT_NOPROGRESS, 1); curl_easy_setopt(ch->cp, CURLOPT_VERBOSE, 0); curl_easy_setopt(ch->cp, CURLOPT_ERRORBUFFER, ch->err.str); curl_easy_setopt(ch->cp, CURLOPT_WRITEFUNCTION, curl_write); curl_easy_setopt(ch->cp, CURLOPT_FILE, (void *) ch); curl_easy_setopt(ch->cp, CURLOPT_READFUNCTION, curl_read); curl_easy_setopt(ch->cp, CURLOPT_INFILE, (void *) ch); curl_easy_setopt(ch->cp, CURLOPT_HEADERFUNCTION, curl_write_header); curl_easy_setopt(ch->cp, CURLOPT_WRITEHEADER, (void *) ch); curl_easy_setopt(ch->cp, CURLOPT_DNS_USE_GLOBAL_CACHE, 1); curl_easy_setopt(ch->cp, CURLOPT_DNS_CACHE_TIMEOUT, 120); curl_easy_setopt(ch->cp, CURLOPT_MAXREDIRS, 20); /* prevent infinite redirects */ cainfo = INI_STR("curl.cainfo"); if (cainfo && strlen(cainfo) > 0) { curl_easy_setopt(ch->cp, CURLOPT_CAINFO, cainfo); } #if defined(ZTS) curl_easy_setopt(ch->cp, CURLOPT_NOSIGNAL, 1); #endif if (url) { if (!php_curl_option_url(ch, url, url_len)) { _php_curl_close_ex(ch TSRMLS_CC); RETURN_FALSE; } } ZEND_REGISTER_RESOURCE(return_value, ch, le_curl); ch->id = Z_LVAL_P(return_value); }
执行真实下载
/*
 * curl_exec(resource ch)
 *
 * Performs the transfer on the libcurl handle wrapped by the resource.
 * Returns FALSE when curl_easy_perform() reports anything other than
 * CURLE_OK / CURLE_PARTIAL_FILE. When the write handler is in
 * PHP_CURL_RETURN mode the collected body (or an empty string) is returned;
 * in every other mode TRUE is returned.
 */
PHP_FUNCTION(curl_exec)
{
	CURLcode error;
	zval *zid;
	php_curl *ch;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
		return;
	}

	ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);

	_php_curl_verify_handlers(ch, 1 TSRMLS_CC);

	_php_curl_cleanup_handle(ch);

	/* the actual transfer is delegated to libcurl */
	error = curl_easy_perform(ch->cp);
	SAVE_CURL_ERROR(ch, error);
	/* CURLE_PARTIAL_FILE is returned by HEAD requests */
	if (error != CURLE_OK && error != CURLE_PARTIAL_FILE) {
		/* drop whatever was buffered before the failure */
		if (ch->handlers->write->buf.len > 0) {
			smart_str_free(&ch->handlers->write->buf);
		}
		RETURN_FALSE;
	}

	/* flush the stream configured as std_err, if there is one */
	if (ch->handlers->std_err) {
		php_stream *stream;
		stream = (php_stream*)zend_fetch_resource(&ch->handlers->std_err TSRMLS_CC, -1, NULL, NULL, 2, php_file_le_stream(), php_file_le_pstream());
		if (stream) {
			php_stream_flush(stream);
		}
	}

	/* return mode with data: NUL-terminate the buffer and hand it back */
	if (ch->handlers->write->method == PHP_CURL_RETURN && ch->handlers->write->buf.len > 0) {
		smart_str_0(&ch->handlers->write->buf);
		RETURN_STRINGL(ch->handlers->write->buf.c, ch->handlers->write->buf.len, 1);
	}

	/* flush the file handle, so any remaining data is synched to disk */
	if (ch->handlers->write->method == PHP_CURL_FILE && ch->handlers->write->fp) {
		fflush(ch->handlers->write->fp);
	}
	if (ch->handlers->write_header->method == PHP_CURL_FILE && ch->handlers->write_header->fp) {
		fflush(ch->handlers->write_header->fp);
	}

	/* return mode without any data yields an empty string, not TRUE */
	if (ch->handlers->write->method == PHP_CURL_RETURN) {
		RETURN_EMPTY_STRING();
	} else {
		RETURN_TRUE;
	}
}
关闭程序,清空内存
PHP_FUNCTION(curl_close) { zval *zid; php_curl *ch; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) { return; } ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl); if (ch->in_callback) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Attempt to close cURL handle from a callback"); return; } if (ch->uses) { ch->uses--; } else { zend_list_delete(Z_LVAL_P(zid)); } }
python的pycurl
1.使用的数据结构:
/* Python-level Curl object: a Python object header plus the libcurl easy
   handle it wraps and the Python-side state kept alongside it. */
typedef struct {
PyObject_HEAD
PyObject *dict; /* Python attributes dictionary */
CURL *handle; /* the wrapped libcurl (C) easy handle; created with curl_easy_init() in do_curl_new */
PyThreadState *state;
CurlMultiObject *multi_stack;
CurlShareObject *share;
struct curl_httppost *httppost;
struct curl_slist *httpheader;
struct curl_slist *http200aliases;
struct curl_slist *quote;
struct curl_slist *postquote;
struct curl_slist *prequote;
/* callbacks */
PyObject *w_cb;
PyObject *h_cb;
PyObject *r_cb;
PyObject *pro_cb;
PyObject *debug_cb;
PyObject *ioctl_cb;
PyObject *opensocket_cb;
/* file objects */
PyObject *readdata_fp;
PyObject *writedata_fp;
PyObject *writeheader_fp;
/* misc */
void *options[OPTIONS_SIZE]; /* for OBJECTPOINT options; holds the heap buffers handed to curl_easy_setopt (e.g. the default USERAGENT string) */
char error[CURL_ERROR_SIZE+1]; /* registered as CURLOPT_ERRORBUFFER in do_curl_new */
} CurlObject;
方法:
1.初始化对象:
/*
 * Constructor behind pycurl.Curl().
 *
 * Allocates the Python object, creates the underlying libcurl easy handle
 * with curl_easy_init(), and then applies pycurl's defaults through
 * curl_easy_setopt(): error buffer, back-reference to the Python object,
 * no progress output, no verbose output, no FTP account and a
 * "PycURL/<libcurl version>" user agent.
 *
 * Returns the new object, or NULL with a Python exception set when any
 * step after the object allocation fails.
 */
static CurlObject *
do_curl_new(PyObject *dummy)
{
    CurlObject *self = NULL;
    char *agent = NULL;

    UNUSED(dummy);

    /* Allocate python curl object */
    self = util_curl_new();
    if (self == NULL)
        return NULL;

    /* Initialize curl handle */
    self->handle = curl_easy_init();
    if (self->handle == NULL)
        goto error;

    /* Set curl error buffer and zero it */
    if (curl_easy_setopt(self->handle, CURLOPT_ERRORBUFFER, self->error) != CURLE_OK)
        goto error;
    memset(self->error, 0, sizeof(self->error));

    /* Set backreference */
    if (curl_easy_setopt(self->handle, CURLOPT_PRIVATE, (char *) self) != CURLE_OK)
        goto error;

    /* Enable NOPROGRESS by default, i.e. no progress output */
    if (curl_easy_setopt(self->handle, CURLOPT_NOPROGRESS, (long)1) != CURLE_OK)
        goto error;

    /* Disable VERBOSE by default, i.e. no verbose output */
    if (curl_easy_setopt(self->handle, CURLOPT_VERBOSE, (long)0) != CURLE_OK)
        goto error;

    /* Set FTP_ACCOUNT to NULL by default */
    if (curl_easy_setopt(self->handle, CURLOPT_FTP_ACCOUNT, NULL) != CURLE_OK)
        goto error;

    /* Set default USERAGENT: "PycURL/" (7 chars) + version + NUL */
    agent = (char *) malloc(7 + strlen(LIBCURL_VERSION) + 1);
    if (agent == NULL)
        goto error;
    strcpy(agent, "PycURL/");
    strcpy(agent+7, LIBCURL_VERSION);
    if (curl_easy_setopt(self->handle, CURLOPT_USERAGENT, (char *) agent) != CURLE_OK) {
        free(agent);
        goto error;
    }
    /* the buffer is now kept in the object's option table */
    self->options[ OPT_INDEX(CURLOPT_USERAGENT) ] = agent;
    agent = NULL;

    /* Success - return new object */
    return self;

error:
    Py_DECREF(self);    /* this also closes self->handle */
    PyErr_SetString(ErrorObject, "initializing curl failed");
    return NULL;
}
2.设置参数
do_curl_setopt(CurlObject *self, PyObject *args) { int option; PyObject *obj; int res; if (!PyArg_ParseTuple(args, "iO:setopt", &option, &obj)) return NULL; if (check_curl_state(self, 1 | 2, "setopt") != 0) return NULL; /* early checks of option value */ if (option <= 0) goto error; if (option >= (int)CURLOPTTYPE_OFF_T + OPTIONS_SIZE) goto error; if (option % 10000 >= OPTIONS_SIZE) goto error; #if 0 /* XXX - should we ??? */ /* Handle the case of None */ if (obj == Py_None) { return util_curl_unsetopt(self, option); } #endif /* Handle the case of string arguments */ if (PyString_Check(obj)) { char *str = NULL; Py_ssize_t len = -1; char *buf; int opt_index; /* Check that the option specified a string as well as the input */ switch (option) { case CURLOPT_CAINFO: /*此处省略10000行,为pycurl未实现的curl的功能*/ case CURLOPT_CRLFILE: case CURLOPT_ISSUERCERT: /* FIXME: check if more of these options allow binary data */ str = PyString_AsString_NoNUL(obj); if (str == NULL) return NULL; break; case CURLOPT_POSTFIELDS: if (PyString_AsStringAndSize(obj, &str, &len) != 0) return NULL; /* automatically set POSTFIELDSIZE */ if (len <= INT_MAX) { res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE, (long)len); /*可以看到pycurl的设置参数也就是使用的c的curl的curl_easy_setopt,即是对C的curl的一种封装*/ } else { res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)len); } if (res != CURLE_OK) { CURLERROR_RETVAL(); } break; default: PyErr_SetString(PyExc_TypeError, "strings are not supported for this option"); return NULL; } /* Allocate memory to hold the string */ assert(str != NULL); if (len <= 0) buf = strdup(str); else { buf = (char *) malloc(len); if (buf) memcpy(buf, str, len); } if (buf == NULL) return PyErr_NoMemory(); /* Call setopt */ res = curl_easy_setopt(self->handle, (CURLoption)option, buf); /* Check for errors */ if (res != CURLE_OK) { free(buf); CURLERROR_RETVAL(); } /* Save allocated option buffer */ opt_index = OPT_INDEX(option); if (self->options[opt_index] != NULL) { 
free(self->options[opt_index]); self->options[opt_index] = NULL; } self->options[opt_index] = buf; Py_INCREF(Py_None); return Py_None; }
3.关闭连接,或者说是删除内存中对象。
/*
 * Curl.close(): returns NULL when check_curl_state() reports a non-zero
 * status for "close"; otherwise calls util_curl_close() on the object and
 * returns None.
 * NOTE(review): the original comment says util_curl_close "deletes the
 * CurlObject"; what it releases is not visible in this excerpt - confirm.
 */
static PyObject *
do_curl_close(CurlObject *self)
{
    PyObject *none = Py_None;

    if (check_curl_state(self, 2, "close") != 0)
        return NULL;

    util_curl_close(self);

    Py_INCREF(none);
    return none;
}
由以上分析可以看出,php的curl和python的pycurl都是对C的curl的一种封装。如果想写出一个更符合自己需求的配置型爬虫,可以考虑直接用C写;不过用C写爬虫不适合快速开发,这是由代码量决定的。
当然,更好的建议是使用webkit做爬虫:它是部分浏览器的内核,其能力毋庸置疑。这个以后再说。