import requests
from pyquery import PyQuery as pq
from multiprocessing import Pool
import xlwt
def search_single_page(page):
    """Scrape one search-result page (up to 30 items) from the library index.

    Args:
        page: 1-based page number to fetch.

    Returns:
        ``[page, rows]`` where ``rows`` is a list of 30 slots; each scraped
        slot is ``[serial_no, title, url, intro]`` and any slot with no
        corresponding item keeps the placeholder string ``'default'``.
    """
    # BUGFIX: the original URL literal was mojibake — the '&curren' of
    # '&currentindex=' had been rendered as the HTML entity '¤', and the
    # whole URL was wrapped in markdown autolink brackets '<...>'.
    url = ('http://10.22.1.18/Search/index?type=menu2&Class_ID=&country='
           '&year=&sid=update&currentindex=' + str(page))
    html = requests.get(url=url).text
    doc = pq(html)
    items = doc('body > div.section.list-box.clearfix > div.list-items '
                '> div.items > ul li').items()
    single_page_list = ['default' for _ in range(30)]
    # Global 1-based serial number of the first item on this page.
    count = page * 30 - 29
    for slot, item in enumerate(items):
        title = item.find('div.tit a').text()
        # attr() returns None when the anchor has no href; the original
        # .encode() call would crash on that — fall back to ''.
        href = item.find('a').attr('href') or ''
        intro = item.find('span.tip').text()
        single_page_list[slot] = [count + slot, title, href, intro]
    return [page, single_page_list]
def traverse():
    """Scrape all 40 result pages in parallel and flatten them.

    Returns:
        A flat list of 1200 entries (40 pages x 30 items), ordered by page
        then by position within the page; unfilled slots hold 'default'.
    """
    pool = Pool()
    # Fan the 40 page fetches out to the worker pool (pages are 1-based).
    pool_result = [pool.apply_async(search_single_page, (p + 1,))
                   for p in range(40)]
    # BUGFIX: the original never closed/joined the pool, leaking worker
    # processes until interpreter exit.
    pool.close()
    result_list = ['默认' for _ in range(40)]
    print(pool_result)
    for r in pool_result:
        page_no, content = r.get()
        loc = page_no - 1
        print(loc, content)
        result_list[loc] = content
    pool.join()
    # Flatten the 40x30 page structure into one 1200-slot list.
    total_result = ['default' for _ in range(1200)]
    for page in range(40):
        for item_num in range(30):
            total_result[page * 30 + item_num] = result_list[page][item_num]
    return total_result
def main():
    """Scrape the catalogue and save it as an .xls workbook."""
    item_list = traverse()
    book = xlwt.Workbook(encoding='utf-8')
    movie = book.add_sheet(u'movie', cell_overwrite_ok=True)
    # Header row.
    for col, header in enumerate(('No.', 'Title', 'url', 'intro')):
        movie.write(0, col, header)
    # BUGFIX: iterate over what traverse() actually returned instead of a
    # hard-coded 1200, so a shorter result list no longer raises IndexError.
    for line, row_data in enumerate(item_list, start=1):
        for col in range(4):
            movie.write(line, col, row_data[col])
    book.save('LibOnlineDatabase.xls')
    print('copyright©番茄杀手')


if __name__ == '__main__':
    main()
# Updated 2021-01-14