使用cURL的multi接口同时通过多个代理建立连接

我需要针对某个网站检查列表中的许多代理是否可用。 我决定用libcurl来做这件事。
我以libcurl的一个示例为基础,并根据自己的需要做了修改。
这是我的代码:

/*
 * Listing from the question: a libcurl multi-interface program that tries to
 * fetch one URL through every proxy listed in "data.txt" at the same time.
 *
 * NOTE(review): this listing is damaged and will not compile as shown:
 *   - every #include has lost its header name;
 *   - the text between `cout << "Proxy: " << proxy <` and `= 0) {` is
 *     missing, so the rest of the read loop and the start of the select()
 *     loop (including the declarations of timeout, curl_timeo, mc, rc, maxfd
 *     and the fd sets) are not visible here;
 *   - CURLOPT_URL is set to an empty string - presumably the real URL was
 *     stripped together with the other lost text; confirm against the
 *     original post.
 *
 * createProxyHandle(proxyData):
 *   Creates an easy handle with curl_easy_init(), builds a three-entry
 *   request-header list (Accept, Accept-Language, Accept-Encoding) and
 *   configures the handle: connect timeout 40, total timeout 50, fresh
 *   connection, verbose output, follow redirects, "gzip" content encoding,
 *   proxy = proxyData, the custom header list and a fixed User-Agent string.
 *   Returns the handle. The header list pointer is local to the function and
 *   is neither stored nor freed in the visible code.
 *
 * main():
 *   Opens "data.txt", creates a multi handle and reads whitespace-separated
 *   proxy strings, printing each one. In the visible part of the loop the
 *   select() timeout is derived from curl_timeo (capped at one second), the
 *   descriptor sets are obtained with curl_multi_fdset(), and the code either
 *   sleeps 100 ms (when maxfd == -1) or calls select(). Unless select()
 *   returned -1, curl_multi_perform() is called; the loop repeats while
 *   still_running is non-zero. Afterwards every CURLMSG_DONE message is
 *   reported with its result code, "Completed" is printed and the multi
 *   handle is cleaned up.
 *
 * NOTE(review): in the visible text the global handles[100] array is never
 * referenced, and no easy handle is removed from the multi handle or passed
 * to curl_easy_cleanup() - verify whether the lost span did any of this.
 * NOTE(review): the numeric and boolean option values are passed as int/bool
 * literals; check the curl_easy_setopt() documentation for the argument type
 * each option expects.
 */
#include  #include  #include  #include  #include  #include  /* somewhat unix-specific */ #include  #include  using namespace std; CURL * handles [100]; CURL * createProxyHandle (string proxyData){ CURL * handle = curl_easy_init (); curl_slist * chunk = NULL; chunk = curl_slist_append(chunk, "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1"); chunk = curl_slist_append(chunk, "Accept-Language: ru-RU,ru;q=0.9,en;q=0.8"); chunk = curl_slist_append(chunk, "Accept-Encoding: gzip, deflate, sdch"); curl_easy_setopt (handle, CURLOPT_URL, ""); curl_easy_setopt (handle, CURLOPT_CONNECTTIMEOUT, 40); curl_easy_setopt (handle, CURLOPT_TIMEOUT, 50); curl_easy_setopt (handle, CURLOPT_FRESH_CONNECT, true); curl_easy_setopt (handle, CURLOPT_VERBOSE, true); curl_easy_setopt (handle, CURLOPT_FOLLOWLOCATION, true); curl_easy_setopt (handle, CURLOPT_ENCODING , "gzip"); curl_easy_setopt (handle, CURLOPT_PROXY, proxyData.c_str()); curl_easy_setopt (handle, CURLOPT_HTTPHEADER, chunk); curl_easy_setopt (handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36"); return handle; } int main(){ ifstream fin ("data.txt", ifstream::in); string proxy; CURLM *multi_handle; CURLMsg *msg; int msgs_left; int still_running; multi_handle = curl_multi_init(); while (fin >> proxy){ cout << "Proxy: " << proxy <= 0) { timeout.tv_sec = curl_timeo / 1000; if(timeout.tv_sec > 1) timeout.tv_sec = 1; else timeout.tv_usec = (curl_timeo % 1000) * 1000; } /* get file descriptors from the transfers */ mc = curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcep, &maxfd); if(mc != CURLM_OK) { fprintf(stderr, "curl_multi_fdset() failed, code %d.\n", mc); break; } /* On success the value of maxfd is guaranteed to be >= -1. We call select(maxfd + 1, ...); specially in case of (maxfd == -1) there are no fds ready yet so we call select(0, ...) 
--or Sleep() on Windows-- to sleep 100ms, which is the minimum suggested value in the curl_multi_fdset() doc. */ if(maxfd == -1) { #ifdef _WIN32 Sleep(100); rc = 0; #else /* Portable sleep for platforms other than Windows. */ struct timeval wait = { 0, 100 * 1000 }; /* 100ms */ rc = select(0, NULL, NULL, NULL, &wait); #endif } else { /* Note that on some platforms 'timeout' may be modified by select(). If you need access to the original value save a copy beforehand. */ rc = select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout); } switch(rc) { case -1: /* select error */ break; case 0: default: /* timeout or readable/writable sockets */ curl_multi_perform(multi_handle, &still_running); break; } } while(still_running); while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { if (msg->msg == CURLMSG_DONE) { printf("Finished with %d\n", msg->data.result); } } cout << "Completed" << endl; curl_multi_cleanup(multi_handle); return 0; } 

这些代理并不可靠,以下是我在输出中看到的内容:

 Proxy: 69.12.64.105:8089 Proxy: 69.12.64.105:7808 Proxy: 210.245.20.170:80 Proxy: 190.74.165.109:8080 Proxy: 39.184.2.111:8123 Proxy: 190.201.166.37:8080 Proxy: 190.36.85.199:8080 Proxy: 92.255.231.54:8080 Proxy: 124.126.126.101:80 Proxy: 43.250.255.65:8080 Proxy: 69.12.64.106:7808 Proxy: 201.217.213.166:8080 Proxy: 178.169.90.188:8888 Proxy: 124.248.205.25:8080 Proxy: 39.190.82.133:8123 Proxy: 190.77.230.36:8080 * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 69.12.64.105... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 69.12.64.105... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 210.245.20.170... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 190.74.165.109... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 39.184.2.111... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 190.201.166.37... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 190.36.85.199... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 92.255.231.54... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 124.126.126.101... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 43.250.255.65... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 69.12.64.106... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 201.217.213.166... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 178.169.90.188... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 124.248.205.25... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 39.190.82.133... * Rebuilt URL to:  * Hostname was NOT found in DNS cache * Trying 190.77.230.36... 
* Connected to 69.12.64.105 (69.12.64.105) port 8089 (#0) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch < HTTP/1.1 503 Service Unavailable < Server: squid/3.2.13 < Mime-Version: 1.0 < Date: Mon, 20 Apr 2015 23:00:24 GMT < Content-Type: text/html < Content-Length: 3694 < X-Squid-Error: ERR_DNS_FAIL 0 < * Received HTTP code 503 from proxy after CONNECT * Connected to 69.12.64.105 (69.12.64.105) port 7808 (#1) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch < HTTP/1.1 503 Service Unavailable < Server: squid/3.2.13 < Mime-Version: 1.0 < Date: Mon, 20 Apr 2015 23:00:25 GMT < Content-Type: text/html < Content-Length: 3694 < X-Squid-Error: ERR_DNS_FAIL 0 < * Received HTTP code 503 from proxy after CONNECT * Connected to 43.250.255.65 (43.250.255.65) port 8080 (#9) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch < HTTP/1.1 200 OK < * Proxy 
replied OK to CONNECT request * found 173 certificates in /etc/ssl/certs/ca-certificates.crt * Connected to 69.12.64.106 (69.12.64.106) port 7808 (#10) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch < HTTP/1.1 200 Connection established < * Proxy replied OK to CONNECT request * found 173 certificates in /etc/ssl/certs/ca-certificates.crt * Connected to 190.77.230.36 (190.77.230.36) port 8080 (#15) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch < HTTP/1.0 200 Connection established < Proxy-agent: tinyproxy/1.8.2 < * Proxy replied OK to CONNECT request * found 173 certificates in /etc/ssl/certs/ca-certificates.crt * Connected to 39.184.2.111 (39.184.2.111) port 8123 (#4) * Establish HTTP proxy tunnel to :443 > CONNECT :443 HTTP/1.1 Host: :443 User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36 Proxy-Connection: Keep-Alive Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1 Accept-Language: ru-RU,ru;q=0.9,en;q=0.8 Accept-Encoding: gzip, deflate, sdch * Proxy CONNECT aborted due to timeout * Connection time-out * Closing connection 5 * Connection time-out * Closing 
connection 6 * Connection time-out * Closing connection 7 * Connection time-out * Closing connection 8 * SSL connection timeout * Closing connection 9 * SSL connection timeout * Closing connection 10 * Connection time-out * Closing connection 11 * Connection time-out * Closing connection 12 * Connection time-out * Closing connection 13 * Connection time-out * Closing connection 14 * SSL connection timeout * Closing connection 15 * Connection timed out after 50056 milliseconds * Connection timed out after 50055 milliseconds Finished with 56 Finished with 56 Finished with 56 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Finished with 28 Completed 

在某些情况下(确实有很多代理质量很差,但这个列表中的代理并非全都如此),curl收到了代理的应答,也发送了请求头,但之后就没有下文了。 我单独测试过这些代理,它们是正常的。
我想不出curl的multi接口到底出了什么问题。

cURL的文档中列出了multi接口的一些限制。 我意识到自己恰好用到了其中受限制的功能:

  • NSS SSL连接
  • HTTP代理CONNECT操作

作为这个问题的解决方案,我改用curl的easy接口配合POSIX线程,它运行良好。 下面是我的解决方案,它基于curl的一个示例,其中包含一些用于在多线程环境下安全使用TLS的代码:

 #define USE_GNUTLS #include  #include  #include  /* we have this global to let the callback get easy access to it */ static pthread_mutex_t *lockarray; #ifdef USE_GNUTLS #include  #include  GCRY_THREAD_OPTION_PTHREAD_IMPL; void init_locks(void) { gcry_control(GCRYCTL_SET_THREAD_CBS); } #define kill_locks() #endif static void *pull_one_url(void *url) { FILE * file = fopen ("/dev/null", "w"); CURL * handle = curl_easy_init (); curl_slist * chunk = NULL; chunk = curl_slist_append(chunk, "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1"); chunk = curl_slist_append(chunk, "Accept-Language: ru-RU,ru;q=0.9,en;q=0.8"); chunk = curl_slist_append(chunk, "Accept-Encoding: gzip, deflate, sdch"); curl_easy_setopt (handle, CURLOPT_URL, "https://www.avito.ru"); curl_easy_setopt (handle, CURLOPT_CONNECTTIMEOUT, 30); curl_easy_setopt (handle, CURLOPT_TIMEOUT, 30); curl_easy_setopt (handle, CURLOPT_FRESH_CONNECT, true); //curl_easy_setopt (handle, CURLOPT_VERBOSE, true); curl_easy_setopt (handle, CURLOPT_FOLLOWLOCATION, true); curl_easy_setopt (handle, CURLOPT_WRITEDATA, file); curl_easy_setopt (handle, CURLOPT_TCP_KEEPALIVE, 0L); curl_easy_setopt (handle, CURLOPT_ENCODING , "gzip"); curl_easy_setopt (handle, CURLOPT_PROXY, (const char*) url); curl_easy_setopt (handle, CURLOPT_HTTPHEADER, chunk); curl_easy_setopt (handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36"); CURLcode res = curl_easy_perform (handle); if (res != CURLE_OK){ printf ("Proxy %s failed with: %d (%s)\n", (const char*) url, res, curl_easy_strerror (res)); } else { long http_code = 0; curl_easy_getinfo (handle, CURLINFO_RESPONSE_CODE, &http_code); printf("Proxy %s finished with code: %d\n", (const char*) url, http_code); } curl_easy_cleanup (handle); return NULL; } const int NUMT = 21; const char * urls[] = { "69.12.64.105:8089", 
"69.12.64.105:7808", "210.245.20.170:80", "190.74.165.109:8080", "39.184.2.111:8123", "190.201.166.37:8080", "190.36.85.199:8080", "92.255.231.54:8080", "124.126.126.101:80", "43.250.255.65:8080", "69.12.64.106:7808", "201.217.213.166:8080", "178.169.90.188:8888", "124.248.205.25:8080", "39.190.82.133:8123", "190.77.230.36:8080", "201.243.204.230:8080", "190.201.58.26:8080", "178.166.155.36:8080", "183.221.188.66:8123", "207.66.105.37:24040", }; int main(int argc, char **argv) { pthread_t tid[NUMT]; int i; int error; (void)argc; /* we don't use any arguments in this example */ (void)argv; /* Must initialize libcurl before any threads are started */ curl_global_init(CURL_GLOBAL_ALL); init_locks(); for(i=0; i< NUMT; i++) { error = pthread_create(&tid[i], NULL, /* default attributes please */ pull_one_url, (void *)urls[i]); if(0 != error) fprintf(stderr, "Couldn't run thread number %d, errno %d\n", i, error); else fprintf(stderr, "Thread %d, gets %s\n", i, urls[i]); } /* now wait for all threads to terminate */ for(i=0; i< NUMT; i++) { error = pthread_join(tid[i], NULL); fprintf(stderr, "Thread %d terminated\n", i); } kill_locks(); return 0; }