Using multiprocessing to crawl data and write it to a file: the program runs without errors, but the file is garbled (mojibake) when opened.
When the same code is rewritten with multithreading, the problem does not appear and everything works.
Here is the code that writes the data to the file:
def Get_urls(start_page, end_page):
    print 'run task {} ({})'.format(start_page, os.getpid())
    # every worker process opens its own handle to the same file
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    for i in range(start_page, end_page + 1):
        pageurl = baseurl1 + str(i) + baseurl2 + searchword
        response = requests.get(pageurl, headers=header)
        soup = BeautifulSoup(response.content, 'html.parser')
        a_list = soup.find_all('a')
        for a in a_list:
            if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                text = a.text.strip()
                url = baseurl + str(a['href'])
                url_text.write(text + '\t' + url + '\n')
    url_text.close()
The multiprocessing side uses a process pool:
def Multiple_processes_test():
    t1 = time.time()
    print 'parent process {} '.format(os.getpid())
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    pool = multiprocessing.Pool(processes=3)
    for page_range in page_ranges_list:
        pool.apply_async(func=Get_urls, args=(page_range[0], page_range[1]))
    pool.close()
    pool.join()
    t2 = time.time()
    print 'time:', t2 - t1
As the screenshot already shows, the file was loaded with the wrong encoding: the bytes your processes wrote are not valid UTF-8.
Having several processes append to the same file is quite dangerous and very likely to go wrong: each worker holds its own buffered handle, and when their flushes interleave, a multibyte UTF-8 character can be split mid-sequence, which is exactly what mojibake looks like.
Multithreading most likely gets away with it because of the GIL (and because the threads share a single file object), while separate processes have no such lock, so corruption comes easily.
The line every worker executes is the culprit:
url_text = codecs.open('url.txt','a','utf-8')
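To see why interleaved appends break the encoding, here is a minimal sketch (my own illustration, not part of the original post): every process opens its own buffered handle to the same file and appends a 3-byte UTF-8 character in a tight loop. Whether a given run actually corrupts the file depends on the platform and flush timing; the filename demo.txt and the loop count are arbitrary.

import codecs
import multiprocessing

def worker(n):
    # each process gets its own handle and its own write buffer
    f = codecs.open('demo.txt', 'a', 'utf-8')
    for _ in range(100000):
        f.write(u'漢')  # 3 bytes in UTF-8; a flush can split it mid-character
    f.close()

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=4)
    pool.map(worker, range(4))
    pool.close()
    pool.join()
    raw = open('demo.txt', 'rb').read()
    try:
        raw.decode('utf-8')
        print('decoded cleanly this run')
    except UnicodeDecodeError as e:
        print('corrupted UTF-8: {}'.format(e))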
I suggest switching to a producer-consumer model, so that only one process ever writes the file!
For example:
# -*- coding: utf-8 -*-
from __future__ import print_function  # keep print() consistent on Python 2
import time
import os
import codecs
import multiprocessing
import requests
from bs4 import BeautifulSoup

baseurl = ''
baseurl1 = ''
baseurl2 = ''
pageurl = ''
searchword = ''
header = {}

# stub out requests.get so the example runs without hitting a real site
def fake(url, **kwargs):
    class Response(object):
        pass
    o = Response()
    o.content = '<a href="/{}/wssd_content.jsp?bookid">foo</a>'.format(url)
    return o

requests.get = fake
def Get_urls(start_page, end_page, queue):
    print('run task {} ({})'.format(start_page, os.getpid()))
    try:
        for i in range(start_page, end_page + 1):
            pageurl = baseurl1 + str(i) + baseurl2 + searchword
            response = requests.get(pageurl, headers=header)
            soup = BeautifulSoup(response.content, 'html.parser')
            a_list = soup.find_all('a')
            for a in a_list:
                if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                    text = a.text.strip()
                    url = baseurl + str(a['href'])
                    # producers only put lines on the queue; they never touch the file
                    queue.put(text + '\t' + url + '\n')
    except Exception:
        import traceback
        traceback.print_exc()
def write_file(queue):
    print("start write file")
    # the consumer is the only process that ever writes url.txt
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    while True:
        line = queue.get()
        if line is None:  # sentinel: all producers are done
            break
        print("write {}".format(line))
        url_text.write(line)
    url_text.close()
def Multiple_processes_test():
    t1 = time.time()
    manager = multiprocessing.Manager()
    queue = manager.Queue()  # a managed queue can be shared with pool workers
    print('parent process {} '.format(os.getpid()))
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    # start the single consumer before any producers are launched
    consumer = multiprocessing.Process(target=write_file, args=(queue,))
    consumer.start()
    pool = multiprocessing.Pool(processes=3)
    results = []
    for page_range in page_ranges_list:
        result = pool.apply_async(func=Get_urls,
                                  args=(page_range[0],
                                        page_range[1],
                                        queue))
        results.append(result)
    pool.close()
    pool.join()
    queue.put(None)  # sentinel: tell the consumer to stop
    consumer.join()
    t2 = time.time()
    print('time:', t2 - t1)

if __name__ == '__main__':
    Multiple_processes_test()
The contents of url.txt after a run (the order of the 1-3 / 4-6 / 7-9 batches varies with scheduling, but every line is intact):
foo /4/wssd_content.jsp?bookid
foo /5/wssd_content.jsp?bookid
foo /6/wssd_content.jsp?bookid
foo /1/wssd_content.jsp?bookid
foo /2/wssd_content.jsp?bookid
foo /3/wssd_content.jsp?bookid
foo /7/wssd_content.jsp?bookid
foo /8/wssd_content.jsp?bookid
foo /9/wssd_content.jsp?bookid
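For completeness, an alternative that avoids the dedicated writer process (my own sketch, not part of the original answer) is to serialize the appends with a lock from multiprocessing.Manager, which, unlike a bare multiprocessing.Lock, can be passed to pool workers. Each task buffers its lines and appends them under the lock in one go; write_lines and the sample chunks below are hypothetical stand-ins for the real crawl results.

import codecs
import multiprocessing

def write_lines(lines, lock):
    # append a whole task's worth of lines while holding the shared lock,
    # so no two processes can interleave bytes inside url.txt
    with lock:
        f = codecs.open('url.txt', 'a', 'utf-8')
        f.writelines(lines)
        f.close()

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    lock = manager.Lock()  # a managed lock survives pickling; a bare Lock does not
    pool = multiprocessing.Pool(processes=3)
    for chunk in ([u'foo\t/1\n'], [u'foo\t/2\n'], [u'foo\t/3\n']):
        pool.apply_async(write_lines, args=(chunk, lock))
    pool.close()
    pool.join()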