作为参考例子,这个爬虫实现得非常简单,连数据库都没用到;如果缓存的任务满了,就直接丢弃后续的新任务。所以如果要在实际环境中使用,还需要补充其他逻辑代码,这里仅作参考,因此做了完全的简化。

    针对重复 URL 的检测,这里使用 bloom filter 算法进行了优化;对 HTML 文档的 URL 提取,则都是直接放入线程池中来做。

    支持限速、指定下载目录、指定user-agent等选项设置。

    如何运行:

    ./spider http://www.xxxx.com

    设置下载目录:

    ./spider http://www.xxx.com -d=/tmp

    查看帮助:

    ./spider --help

    1. /* //////////////////////////////////////////////////////////////////////////////////////
    2. * includes
    3. */
    4. #include "tbox/tbox.h"
    5.  
    6. /* //////////////////////////////////////////////////////////////////////////////////////
    7. * macros
    8. */
    9.  
    // size, in bytes, of the url buffers kept in every task
    #define TB_DEMO_SPIDER_URL_MAXN (4096)

    // maximum number of live tasks; urls found beyond this count are not posted
    #define TB_DEMO_SPIDER_TASK_MAXN (100)

    // default limited rate of a task: 256000 bytes/s
    #define TB_DEMO_SPIDER_TASK_RATE (256000)

    // default timeout of a task: 15000 ms (15s)
    #define TB_DEMO_SPIDER_TASK_TIMEOUT (15000)

    // item count the bloom filter is initialized with
    #define TB_DEMO_SPIDER_FILTER_MAXN (100000)

    // default User-Agent header, used when none is given on the command line
    #define TB_DEMO_SPIDER_USER_AGENT "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36"
    27.  
    28. /* //////////////////////////////////////////////////////////////////////////////////////
    29. * types
    30. */
    31.  
    32. // the demo spider type
    33. typedef struct __tb_demo_spider_t
    34. {
    35. // the pool
    36. tb_fixed_pool_ref_t pool;
    37.  
    38. // the lock
    39. tb_spinlock_t lock;
    40.  
    41. // the filter
    42. tb_bloom_filter_ref_t filter;
    43.  
    44. // the state
    45. tb_atomic_t state;
    46.  
    47. // the option
    48. tb_option_ref_t option;
    49.  
    50. // the home
    51. tb_char_t const* home;
    52.  
    53. // the root
    54. tb_char_t root[256];
    55.  
    56. // the timeout
    57. tb_long_t timeout;
    58.  
    59. // the user agent
    60. tb_char_t const* user_agent;
    61.  
    62. // the limited rate
    63. tb_size_t limited_rate;
    64.  
    65. }tb_demo_spider_t;
    66.  
    67. // the demo spider parser type
    68. typedef struct __tb_demo_spider_parser_t
    69. {
    70. // the stream
    71. tb_stream_ref_t stream;
    72.  
    73. // the reader
    74. tb_xml_reader_ref_t reader;
    75.  
    76. // the url
    77. tb_char_t url[8192];
    78.  
    79. }tb_demo_spider_parser_t;
    80.  
    81. // the demo spider task type
    82. typedef struct __tb_demo_spider_task_t
    83. {
    84. // the pool
    85. tb_demo_spider_t* spider;
    86.  
    87. // the iurl
    88. tb_char_t iurl[TB_DEMO_SPIDER_URL_MAXN];
    89.  
    90. // the ourl
    91. tb_char_t ourl[TB_DEMO_SPIDER_URL_MAXN];
    92.  
    93. }tb_demo_spider_task_t;
    94.  
    95. /* //////////////////////////////////////////////////////////////////////////////////////
    96. * globals
    97. */
    98. static tb_option_item_t g_options[] =
    99. {
    100. {'t', "timeout", TB_OPTION_MODE_KEY_VAL, TB_OPTION_TYPE_INTEGER, "set the timeout" }
    101. , {'d', "directory", TB_OPTION_MODE_KEY_VAL, TB_OPTION_TYPE_CSTR, "set the root directory" }
    102. , {'u', "agent", TB_OPTION_MODE_KEY_VAL, TB_OPTION_TYPE_CSTR, "set the user agent" }
    103. , {'r', "rate", TB_OPTION_MODE_KEY_VAL, TB_OPTION_TYPE_INTEGER, "set limited rate" }
    104. , {'h', "help", TB_OPTION_MODE_KEY, TB_OPTION_TYPE_BOOL, "display this help and exit" }
    105. , {'-', "home", TB_OPTION_MODE_VAL, TB_OPTION_TYPE_CSTR, "the home url" }
    106. , {'-', tb_null, TB_OPTION_MODE_END, TB_OPTION_TYPE_NONE, tb_null }
    107. };
    108.  
    109. /* //////////////////////////////////////////////////////////////////////////////////////
    110. * declaration
    111. */
    112. static tb_void_t tb_demo_spider_task_exit(tb_demo_spider_task_t* task);
    113. static tb_bool_t tb_demo_spider_task_done(tb_demo_spider_t* spider, tb_char_t const* url, tb_bool_t html, tb_bool_t* full);
    114.  
    115. /* //////////////////////////////////////////////////////////////////////////////////////
    116. * implementation
    117. */
    118. static tb_bool_t tb_demo_spider_parser_open_html(tb_stream_ref_t stream, tb_char_t const* url)
    119. {
    120. // check
    121. tb_assert_and_check_return_val(stream && url, tb_false);
    122.  
    123. // done
    124. tb_bool_t ok = tb_false;
    125. do
    126. {
    127. // the file path contains /html/?
    128. if (!tb_strstr(url, "html")) break;
    129.  
    130. // ctrl stream
    131. if (!tb_stream_ctrl(stream, TB_STREAM_CTRL_SET_URL, url)) break;
    132.  
    133. // open stream
    134. if (!tb_stream_open(stream)) break;
    135.  
    136. // the stream size
    137. tb_hong_t size = tb_stream_size(stream);
    138. tb_check_break(size);
    139.  
    140. // prefetch some data
    141. tb_byte_t* data = tb_null;
    142. tb_size_t need = tb_min((tb_size_t)size, 256);
    143. if (!tb_stream_need(stream, &data, need)) break;
    144.  
    145. // is html?
    146. if (tb_strnistr((tb_char_t const*)data, need, "<!DOCTYPE html>"))
    147. {
    148. ok = tb_true;
    149. break;
    150. }
    151.  
    152. // is html?
    153. ok = tb_strnistr((tb_char_t const*)data, need, "<html")? tb_true : tb_false;
    154.  
    155. } while (0);
    156.  
    157. // failed?
    158. if (!ok)
    159. {
    160. // clos stream
    161. if (stream) tb_stream_clos(stream);
    162. }
    163.  
    164. // ok?
    165. return ok;
    166. }
    167. static tb_size_t tb_demo_spider_parser_get_url(tb_xml_reader_ref_t reader, tb_char_t* data, tb_size_t maxn, tb_bool_t* html)
    168. {
    169. // check
    170. tb_assert_and_check_return_val(reader && data && maxn && html, tb_false);
    171.  
    172. // walk
    173. tb_size_t ok = 0;
    174. tb_size_t event = TB_XML_READER_EVENT_NONE;
    175. while (!ok && (event = tb_xml_reader_next(reader)))
    176. {
    177. switch (event)
    178. {
    179. case TB_XML_READER_EVENT_ELEMENT_EMPTY:
    180. case TB_XML_READER_EVENT_ELEMENT_BEG:
    181. {
    182. // the element name
    183. tb_char_t const* name = tb_xml_reader_element(reader);
    184. tb_check_break(name);
    185.  
    186. // <a href="" />?
    187. // <link href="" />
    188. // <img src="" />?
    189. // <script src="" />?
    190. // <source src="" />?
    191. // <frame src="" />?
    192. if ( !tb_stricmp(name, "a")
    193. || !tb_stricmp(name, "link")
    194. || !tb_stricmp(name, "img")
    195. || !tb_stricmp(name, "frame")
    196. || !tb_stricmp(name, "source"))
    197. {
    198. // walk attributes
    199. tb_xml_node_ref_t attr = (tb_xml_node_ref_t)tb_xml_reader_attributes(reader);
    200. for (; attr; attr = attr->next)
    201. {
    202. // href or src?
    203. if ( tb_string_size(&attr->data) > 8
    204. && ( !tb_string_cstricmp(&attr->name, "href")
    205. || !tb_string_cstricmp(&attr->name, "src"))
    206. && ( !tb_string_cstrnicmp(&attr->data, "http://", 7)
    207. || !tb_string_cstrnicmp(&attr->data, "https://", 8)))
    208. {
    209. // copy
    210. tb_strlcpy(data, tb_string_cstr(&attr->data), maxn);
    211.  
    212. // ok
    213. ok = tb_string_size(&attr->data);
    214.  
    215. // no html?
    216. if (!tb_stricmp(name, "img") || !tb_stricmp(name, "source"))
    217. *html = tb_false;
    218. else if ( ok > 4
    219. && ( !tb_stricmp(data + ok - 4, ".css")
    220. || !tb_stricmp(data + ok - 4, ".png")
    221. || !tb_stricmp(data + ok - 4, ".jpg")
    222. || !tb_stricmp(data + ok - 4, ".gif")
    223. || !tb_stricmp(data + ok - 4, ".rar")
    224. || !tb_stricmp(data + ok - 4, ".zip")))
    225. {
    226. *html = tb_false;
    227. }
    228. else if ( ok > 3
    229. && ( !tb_stricmp(data + ok - 4, ".js")
    230. || !tb_stricmp(data + ok - 4, ".gz")))
    231. {
    232. *html = tb_false;
    233. }
    234. }
    235. }
    236. }
    237. }
    238. break;
    239. default:
    240. break;
    241. }
    242. }
    243.  
    244. // end
    245. data[maxn - 1] = '\0';
    246.  
    247. // ok?
    248. return ok;
    249. }
    250. static tb_void_t tb_demo_spider_parser_exit(tb_thread_pool_worker_ref_t worker, tb_cpointer_t priv)
    251. {
    252. // check
    253. tb_demo_spider_parser_t* parser = (tb_demo_spider_parser_t*)priv;
    254. tb_assert_and_check_return(parser);
    255.  
    256. // exit stream
    257. if (parser->stream) tb_stream_exit(parser->stream);
    258. parser->stream = tb_null;
    259.  
    260. // exit reader
    261. if (parser->reader) tb_xml_reader_exit(parser->reader);
    262. parser->reader = tb_null;
    263.  
    264. // exit it
    265. tb_free(parser);
    266. }
    267. static tb_demo_spider_parser_t* tb_demo_spider_parser_init(tb_thread_pool_worker_ref_t worker)
    268. {
    269. // check
    270. tb_assert_and_check_return_val(worker, tb_null);
    271.  
    272. // done
    273. tb_bool_t ok = tb_false;
    274. tb_demo_spider_parser_t* parser = tb_null;
    275. do
    276. {
    277. // attempt to get the parser
    278. parser = (tb_demo_spider_parser_t*)tb_thread_pool_worker_getp(worker, 0);
    279. if (!parser)
    280. {
    281. // make parser
    282. parser = tb_malloc0_type(tb_demo_spider_parser_t);
    283. tb_assert_and_check_break(parser);
    284.  
    285. // save parser
    286. tb_thread_pool_worker_setp(worker, 0, tb_demo_spider_parser_exit, (tb_cpointer_t)parser);
    287.  
    288. // init stream
    289. parser->stream = tb_stream_init_file();
    290. tb_assert_and_check_break(parser->stream);
    291.  
    292. // init reader
    293. parser->reader = tb_xml_reader_init();
    294. tb_assert_and_check_break(parser->reader);
    295. }
    296.  
    297. // ok
    298. ok = tb_true;
    299.  
    300. } while (0);
    301.  
    302. // failed?
    303. if (!ok)
    304. {
    305. // exit it
    306. if (parser) tb_demo_spider_parser_exit(worker, (tb_cpointer_t)parser);
    307. parser = tb_null;
    308. }
    309.  
    310. // ok
    311. return parser;
    312. }
    313. static tb_void_t tb_demo_spider_parser_task_done(tb_thread_pool_worker_ref_t worker, tb_cpointer_t priv)
    314. {
    315. // check
    316. tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    317. tb_assert_and_check_return(worker && task && task->spider);
    318.  
    319. // init parser
    320. tb_demo_spider_parser_t* parser = tb_demo_spider_parser_init(worker);
    321. tb_assert_and_check_return(parser && parser->stream && parser->reader);
    322.  
    323. // open stream
    324. if (tb_demo_spider_parser_open_html(parser->stream, task->ourl))
    325. {
    326. // open reader
    327. if (tb_xml_reader_open(parser->reader, parser->stream, tb_false))
    328. {
    329. // parse url
    330. tb_bool_t html = tb_true;
    331. while ( TB_STATE_OK == tb_atomic_get(&task->spider->state)
    332. && tb_demo_spider_parser_get_url(parser->reader, parser->url, sizeof(parser->url) - 1, &html))
    333. {
    334. // trace
    335. tb_trace_d("parser: done: %s => %s", task->iurl, parser->url);
    336.  
    337. // done
    338. tb_bool_t full = tb_false;
    339. if (!tb_demo_spider_task_done(task->spider, parser->url, html, &full) && full) break;
    340.  
    341. // reset html
    342. html = tb_true;
    343. }
    344.  
    345. // clos reader
    346. tb_xml_reader_clos(parser->reader);
    347. }
    348.  
    349. // clos stream
    350. tb_stream_clos(parser->stream);
    351. }
    352. }
    353. static tb_void_t tb_demo_spider_parser_task_exit(tb_thread_pool_worker_ref_t worker, tb_cpointer_t priv)
    354. {
    355. // check
    356. tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    357. tb_assert_and_check_return(worker && task);
    358.  
    359. // exit task
    360. tb_demo_spider_task_exit(task);
    361. }
    362. static tb_bool_t tb_demo_spider_make_ourl(tb_demo_spider_t* spider, tb_char_t const* url, tb_char_t* data, tb_size_t maxn, tb_bool_t html)
    363. {
    364. // check
    365. tb_assert_and_check_return_val(spider && url && data && maxn, tb_false);
    366.  
    367. // skip protocol
    368. tb_char_t* p = (tb_char_t*)url;
    369. tb_char_t* e = (tb_char_t*)url + tb_strlen(url);
    370. if (!tb_strnicmp(p, "http://", 7)) p += 7;
    371. else if (!tb_strnicmp(p, "https://", 8)) p += 8;
    372. tb_assert_and_check_return_val(p < e, tb_false);
    373.  
    374. // find suffix
    375. tb_char_t suffix[64] = {0};
    376. {
    377. tb_char_t* f = e - 1;
    378. while (f >= p && *f != '.') f--;
    379. if (f >= p && *f == '.')
    380. {
    381. f++;
    382. tb_size_t i = 0;
    383. while (f < e && tb_isalpha(*f) && i < 64) suffix[i++] = *f++;
    384. }
    385. }
    386.  
    387. // make md5
    388. tb_byte_t md5_data[16];
    389. tb_size_t md5_size = tb_md5_encode((tb_byte_t const*)p, e - p, md5_data, 16);
    390. tb_assert_and_check_return_val(md5_size == 16, tb_false);
    391.  
    392. // append root
    393. p = data;
    394. e = data + maxn - 1;
    395. if (p < e) p += tb_snprintf(p, e - p, "%s/%s/", spider->root, html? "html" : "other");
    396.  
    397. // append md5
    398. tb_size_t i = 0;
    399. for (i = 0; i < 16 && p < e; ++i) p += tb_snprintf(p, e - p, "%02X", md5_data[i]);
    400. tb_assert_and_check_return_val(p < e, tb_false);
    401.  
    402. // append suffix
    403. if (p < e) p += tb_snprintf(p, e - p, ".%s", suffix[0]? suffix : (html? "html" : "other"));
    404.  
    405. // end
    406. *p = '\0';
    407.  
    408. // trace
    409. tb_trace_d("make: %s => %s", url, data);
    410.  
    411. // ok?
    412. return i == 16? tb_true : tb_false;
    413. }
    414. static tb_void_t tb_demo_spider_task_exit(tb_demo_spider_task_t* task)
    415. {
    416. // check
    417. tb_assert_and_check_return(task);
    418.  
    419. // the spider
    420. tb_demo_spider_t* spider = task->spider;
    421. tb_assert_and_check_return(spider);
    422.  
    423. // trace
    424. tb_trace_d("task: exit: %s", task->iurl);
    425.  
    426. // enter
    427. tb_spinlock_enter(&spider->lock);
    428.  
    429. // exit task
    430. if (spider->pool) tb_fixed_pool_free(spider->pool, task);
    431.  
    432. // leave
    433. tb_spinlock_leave(&spider->lock);
    434. }
    435. static tb_bool_t tb_demo_spider_task_save(tb_size_t state, tb_hize_t offset, tb_hong_t size, tb_hize_t save, tb_size_t rate, tb_cpointer_t priv)
    436. {
    437. // check
    438. tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    439. tb_assert_and_check_return_val(task && task->spider, tb_false);
    440.  
    441. // percent
    442. #ifdef TB_TRACE_DEBUG
    443. tb_size_t percent = 0;
    444. if (size > 0) percent = (tb_size_t)((offset * 100) / size);
    445. else if (state == TB_STATE_OK) percent = 100;
    446.  
    447. // trace
    448. tb_trace_d("save[%s]: %llu, rate: %lu bytes/s, percent: %lu%%, state: %s", task->iurl, save, rate, percent, tb_state_cstr(state));
    449. #endif
    450.  
    451. // ok? continue it
    452. tb_bool_t ok = tb_false;
    453. if (state == TB_STATE_OK) ok = tb_true;
    454. // closed?
    455. else if (state == TB_STATE_CLOSED && TB_STATE_OK == tb_atomic_get(&task->spider->state))
    456. {
    457. // trace
    458. tb_trace_i("task: done: %s: ok", task->iurl);
    459.  
    460. // post parser task
    461. tb_thread_pool_task_post(tb_thread_pool(), "parser_task", tb_demo_spider_parser_task_done, tb_demo_spider_parser_task_exit, task, tb_false);
    462. }
    463. // failed or killed?
    464. else
    465. {
    466. // trace
    467. tb_trace_e("task: done: %s: %s", task->iurl, tb_state_cstr(state));
    468.  
    469. // exit task
    470. tb_demo_spider_task_exit(task);
    471. }
    472.  
    473. // break or continue?
    474. return ok;
    475. }
    476. static tb_bool_t tb_demo_spider_task_ctrl(tb_async_stream_ref_t istream, tb_async_stream_ref_t ostream, tb_cpointer_t priv)
    477. {
    478. // check
    479. tb_demo_spider_task_t* task = (tb_demo_spider_task_t*)priv;
    480. tb_assert_and_check_return_val(task && task->spider, tb_false);
    481. tb_assert_and_check_return_val(istream && ostream, tb_false);
    482. tb_assert_and_check_return_val(tb_async_stream_type(istream) == TB_STREAM_TYPE_HTTP, tb_false);
    483.  
    484. // the url
    485. tb_char_t const* url = tb_null;
    486. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_GET_URL, &url)) return tb_false;
    487.  
    488. // trace
    489. tb_trace_d("ctrl: %s: ..", url);
    490.  
    491. // set timeout
    492. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_SET_TIMEOUT, task->spider->timeout)) return tb_false;
    493.  
    494. // need gzip
    495. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_HTTP_SET_HEAD, "Accept-Encoding", "gzip,deflate")) return tb_false;
    496.  
    497. // auto unzip
    498. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_HTTP_SET_AUTO_UNZIP, 1)) return tb_false;
    499.  
    500. // user agent
    501. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_HTTP_SET_HEAD, "User-Agent", task->spider->user_agent)) return tb_false;
    502.  
    503. // enable cookies
    504. if (!tb_async_stream_ctrl(istream, TB_STREAM_CTRL_HTTP_SET_COOKIES, tb_cookies())) return tb_false;
    505.  
    506. // ok
    507. return tb_true;
    508. }
    509. static tb_bool_t tb_demo_spider_task_done(tb_demo_spider_t* spider, tb_char_t const* url, tb_bool_t html, tb_bool_t* full)
    510. {
    511. // check
    512. tb_assert_and_check_return_val(spider && url, tb_false);
    513.  
    514. // killed?
    515. tb_check_return_val(TB_STATE_OK == tb_atomic_get(&spider->state), tb_false);
    516.  
    517. // enter
    518. tb_spinlock_enter(&spider->lock);
    519.  
    520. // done
    521. tb_bool_t ok = tb_false;
    522. tb_size_t size = 0;
    523. tb_demo_spider_task_t* task = tb_null;
    524. tb_bool_t repeat = tb_false;
    525. do
    526. {
    527. // check
    528. tb_assert_and_check_break(spider->filter && spider->pool);
    529.  
    530. // the task count
    531. size = tb_fixed_pool_size(spider->pool);
    532.  
    533. // have been done already?
    534. if (!tb_bloom_filter_set(spider->filter, url))
    535. {
    536. // trace
    537. tb_trace_d("task: size: %lu, done: %s: repeat", size, url);
    538. ok = tb_true;
    539. repeat = tb_true;
    540. break;
    541. }
    542.  
    543. // trace
    544. tb_trace_d("task: size: %lu, done: %s: ..", size, url);
    545.  
    546. // full?
    547. tb_check_break(size < TB_DEMO_SPIDER_TASK_MAXN);
    548.  
    549. // make task
    550. task = (tb_demo_spider_task_t*)tb_fixed_pool_malloc0(spider->pool);
    551. tb_assert_and_check_break(task);
    552.  
    553. // init task
    554. task->spider = spider;
    555. tb_strlcpy(task->iurl, url, sizeof(task->iurl) - 1);
    556. if (!tb_demo_spider_make_ourl(spider, url, task->ourl, sizeof(task->ourl) - 1, html)) break;
    557.  
    558. // ok
    559. ok = tb_true;
    560.  
    561. } while (0);
    562.  
    563. // leave
    564. tb_spinlock_leave(&spider->lock);
    565.  
    566. // failed?
    567. if (!ok)
    568. {
    569. // exit task
    570. if (task) tb_demo_spider_task_exit(task);
    571. task = tb_null;
    572. }
    573.  
    574. // ok? done task
    575. if (ok && !repeat && TB_STATE_OK == tb_atomic_get(&spider->state)) ok = task? tb_transfer_pool_done(tb_transfer_pool(), url, task->ourl, 0, spider->limited_rate, tb_demo_spider_task_save, tb_demo_spider_task_ctrl, task) : tb_false;
    576.  
    577. // failed?
    578. if (!ok && size < TB_DEMO_SPIDER_TASK_MAXN)
    579. {
    580. // trace
    581. tb_trace_e("task: size: %lu, done: %s: post failed", size, url);
    582. }
    583.  
    584. // save full
    585. if (full) *full = size < TB_DEMO_SPIDER_TASK_MAXN? tb_false : tb_true;
    586.  
    587. // ok?
    588. return ok;
    589. }
    590. static tb_bool_t tb_demo_spider_init(tb_demo_spider_t* spider, tb_int_t argc, tb_char_t** argv)
    591. {
    592. // check
    593. tb_assert_and_check_return_val(spider && argc && argv, tb_false);
    594.  
    595. // done
    596. tb_bool_t ok = tb_false;
    597. do
    598. {
    599. // init option
    600. spider->option = tb_option_init("spider", "the spider demo", g_options);
    601. tb_assert_and_check_break(spider->option);
    602.  
    603. // done option
    604. if (!tb_option_done(spider->option, argc - 1, &argv[1])) break;
    605.  
    606. // init home
    607. spider->home = tb_option_item_cstr(spider->option, "home");
    608. tb_assert_and_check_break(spider->home);
    609. tb_trace_d("home: %s", spider->home);
    610.  
    611. // init root
    612. tb_char_t const* root = tb_option_item_cstr(spider->option, "directory");
    613.  
    614. // init user agent
    615. spider->user_agent = tb_option_item_cstr(spider->option, "agent");
    616.  
    617. // init timeout
    618. if (tb_option_find(spider->option, "timeout"))
    619. spider->timeout = tb_option_item_sint32(spider->option, "timeout");
    620.  
    621. // init limited rate
    622. if (tb_option_find(spider->option, "rate"))
    623. spider->limited_rate = tb_option_item_uint32(spider->option, "rate");
    624.  
    625. // using the default root
    626. if (root) tb_strlcpy(spider->root, root, sizeof(spider->root) - 1);
    627. else
    628. {
    629. // the temporary root
    630. tb_directory_temp(spider->root, sizeof(spider->root) - 1);
    631.  
    632. // append spider
    633. tb_strcat(spider->root, "/spider");
    634. }
    635. tb_trace_d("root: %s", spider->root);
    636.  
    637. // using the default user agent
    638. if (!spider->user_agent) spider->user_agent = TB_DEMO_SPIDER_USER_AGENT;
    639.  
    640. // using the default timeout
    641. if (!spider->timeout) spider->timeout = TB_DEMO_SPIDER_TASK_TIMEOUT;
    642.  
    643. // using the default rate
    644. if (!spider->limited_rate) spider->limited_rate = TB_DEMO_SPIDER_TASK_RATE;
    645.  
    646. // strip root tail: '/' or '\\'
    647. tb_size_t size = tb_strlen(spider->root);
    648. if (size && (spider->root[size - 1] == '/' || spider->root[size - 1] == '\\')) spider->root[size - 1] = '\0';
    649.  
    650. // init state
    651. spider->state = TB_STATE_OK;
    652.  
    653. // init lock
    654. if (!tb_spinlock_init(&spider->lock)) break;
    655.  
    656. // init pool
    657. spider->pool = tb_fixed_pool_init(tb_null, TB_DEMO_SPIDER_TASK_MAXN >> 2, sizeof(tb_demo_spider_task_t), tb_null, tb_null, tb_null);
    658. tb_assert_and_check_break(spider->pool);
    659.  
    660. // init filter
    661. spider->filter = tb_bloom_filter_init(TB_BLOOM_FILTER_PROBABILITY_0_001, 3, TB_DEMO_SPIDER_FILTER_MAXN, tb_item_func_str(tb_true));
    662. tb_assert_and_check_break(spider->filter);
    663.  
    664. // ok
    665. ok = tb_true;
    666.  
    667. } while (0);
    668.  
    669. // failed? help it
    670. if (!ok && spider->option) tb_option_help(spider->option);
    671.  
    672. // ok?
    673. return ok;
    674. }
    675. static tb_void_t tb_demo_spider_exit(tb_demo_spider_t* spider)
    676. {
    677. // check
    678. tb_assert_and_check_return(spider);
    679.  
    680. // trace
    681. tb_trace_d("exit: ..");
    682.  
    683. // kill it
    684. tb_atomic_set(&spider->state, TB_STATE_KILLING);
    685.  
    686. // kill all transfer tasks
    687. tb_transfer_pool_kill_all(tb_transfer_pool());
    688.  
    689. // kill all parser tasks
    690. tb_thread_pool_task_kill_all(tb_thread_pool());
    691.  
    692. // wait all transfer tasks exiting
    693. tb_transfer_pool_wait_all(tb_transfer_pool(), -1);
    694.  
    695. // wait all parser tasks exiting
    696. tb_thread_pool_task_wait_all(tb_thread_pool(), -1);
    697.  
    698. // enter
    699. tb_spinlock_enter(&spider->lock);
    700.  
    701. // exit filter
    702. if (spider->filter) tb_bloom_filter_exit(spider->filter);
    703. spider->filter = tb_null;
    704.  
    705. // exit pool
    706. if (spider->pool) tb_fixed_pool_exit(spider->pool);
    707. spider->pool = tb_null;
    708.  
    709. // leave
    710. tb_spinlock_leave(&spider->lock);
    711.  
    712. // exit lock
    713. tb_spinlock_exit(&spider->lock);
    714.  
    715. // exit option
    716. if (spider->option) tb_option_exit(spider->option);
    717. spider->option = tb_null;
    718.  
    719. // trace
    720. tb_trace_d("exit: ok");
    721. }
    722.  
    723. /* //////////////////////////////////////////////////////////////////////////////////////
    724. * main
    725. */
    726. tb_int_t main(tb_int_t argc, tb_char_t** argv)
    727. {
    728. // init tbox
    729. if (!tb_init(tb_null, tb_null, 0)) return 0;
    730.  
    731. // done
    732. tb_demo_spider_t spider = {0};
    733. do
    734. {
    735. // init spider
    736. if (!tb_demo_spider_init(&spider, argc, argv)) break;
    737.  
    738. // done the home task if exists
    739. tb_demo_spider_task_done(&spider, spider.home, tb_true, tb_null);
    740.  
    741. // wait
    742. getchar();
    743.  
    744. } while (0);
    745.  
    746. // exit spider
    747. tb_demo_spider_exit(&spider);
    748.  
    749. // exit tbox
    750. tb_exit();
    751. return 0;
    752. }