爬虫代码

import requests
from pyquery import PyQuery as pq
from multiprocessing import Pool
import xlwt

def search_single_page(page):
    """Scrape one page of the site's search listing.

    Downloads listing page ``page`` and extracts the title, link and short
    introduction of every ``li`` entry found under the listing container.

    Args:
        page: 1-based page number. It is appended to the ``currentindex``
            query parameter and used to derive the running serial numbers.

    Returns:
        A two-element list ``[page, rows]``. ``rows`` always has exactly 30
        slots: each scraped entry is ``[serial, title, url, intro]`` and
        slots that were not filled keep the placeholder string ``'default'``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Slots per page; the serial numbering below relies on the same value.
    page_size = 30

    # The address has to be a bare URL starting with the scheme; wrapping it
    # in angle brackets makes requests reject it as an unsupported scheme.
    url = ('http://10.22.1.18/Search/index?type=menu2&Class_ID=&country='
           '&year=&sid=update&currentindex=' + str(page))

    # A timeout keeps a stalled server from blocking the caller forever, and
    # an HTTP error is reported instead of being parsed as an empty listing.
    response = requests.get(url=url, timeout=30)
    response.raise_for_status()

    doc = pq(response.text)
    items = doc('body > div.section.list-box.clearfix > div.list-items > div.items > ul li').items()

    single_page_list = ['default'] * page_size
    first_serial = (page - 1) * page_size + 1

    # zip() against range(page_size) stops after page_size entries, so an
    # unexpectedly long listing cannot index past the end of the list.
    for offset, item in zip(range(page_size), items):
        title = item.find('div.tit a').text()
        # attr() yields None when the entry has no link; store '' instead.
        link = item.find('a').attr('href') or ''
        intro = item.find('span.tip').text()
        single_page_list[offset] = [first_serial + offset, title, link, intro]

    return [page, single_page_list]

def traverse(pages=40):
    """Scrape the first ``pages`` listing pages in parallel and flatten them.

    Each page is fetched by ``search_single_page`` in a worker process of a
    ``multiprocessing.Pool``. Pages are consumed in ascending page order, so
    the flattened result is ordered by page and then by position on the page.

    Args:
        pages: number of listing pages to fetch, starting with page 1.
            Defaults to 40.

    Returns:
        A flat list containing the rows of every page, in page order. With
        the default of 40 pages and 30 rows per page that is 1200 entries.

    Raises:
        Exception: any exception raised by ``search_single_page`` in a
            worker is re-raised here when that page's result is consumed.
    """
    total_result = []

    # The with-block shuts the worker processes down even when a page fails;
    # results must therefore be consumed before leaving the block.
    with Pool() as pool:
        # imap() yields results in submission order, one page at a time.
        for page, content in pool.imap(search_single_page, range(1, pages + 1)):
            # Progress output: zero-based page index followed by its rows.
            print(page - 1, content)
            total_result.extend(content)

    return total_result

def main():
    """Scrape the listing and export it to ``LibOnlineDatabase.xls``.

    Writes a header row followed by one spreadsheet row per scraped entry
    (serial number, title, link, introduction) to a sheet named ``movie``,
    saves the workbook under the relative path ``LibOnlineDatabase.xls`` and
    prints a copyright notice.
    """
    item_list = traverse()

    book = xlwt.Workbook(encoding='utf-8')
    movie = book.add_sheet(u'movie', cell_overwrite_ok=True)

    for column, heading in enumerate(('No.', 'Title', 'url', 'intro')):
        movie.write(0, column, heading)

    # Row 0 holds the headings, so data rows start at spreadsheet row 1.
    for line, record in enumerate(item_list, start=1):
        if isinstance(record, str):
            # Unfilled slots are placeholder strings rather than 4-item rows;
            # indexing them would write single characters into the sheet.
            continue
        for column, value in enumerate(record[:4]):
            movie.write(line, column, value)

    book.save('LibOnlineDatabase.xls')
    print('copyright©番茄杀手')

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

片单

2021-1-14更新

LibOnlineDatabase.xls