import requests 
		from flask import json 
		from requests.exceptions import RequestException 
		import re 
		from multiprocessing import Pool 
		 
		''' 
		Request+正则表达式抓取猫眼电影 
		''' 
		 
		''' 
		获取第一页的内容 
		''' 
		def getOneContent(url, headers, timeout=10):
		    """Download a page and return its body text.

		    :param url: address of the page to request
		    :param headers: HTTP headers sent with the request
		    :param timeout: seconds to wait for the server before giving up
		    :return: the response text when the status code is 200; None for any
		        other status or when the request raises a RequestException
		    """
		    try:
		        # requests applies no timeout by default, so an unresponsive
		        # server would otherwise block the calling worker indefinitely.
		        response = requests.get(url, headers=headers, timeout=timeout)
		    except RequestException:
		        return None
		    if response.status_code == 200:
		        return response.text
		    return None
		 
		''' 
		解析内容,根据正则表达式 
		''' 
		def parserContent(content):
		    """Extract the movie fields from a board page with a regular expression.

		    :param content: HTML text of the page; may be None or empty
		    :return: list of 7-tuples of strings, one per matched <dd> entry, in
		        capture-group order: index, image URL, name, star text,
		        release-time text, integer part of the score, fraction part of
		        the score. An empty list when content is falsy or nothing matches.
		    """
		    if not content:
		        # Return an empty list rather than None so callers can iterate
		        # the result unconditionally.
		        return []
		    # Raw strings keep "\d" from being read as an (invalid) string escape.
		    # Adjacent literals are joined by the parser, so no "+" is needed to
		    # continue the pattern on the next line.
		    pattern = re.compile(
		        r'<dd.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?</a>'
		        r'.*?<a.*?data-val.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
		        r'.*?releasetime.*?>(.*?)</p>'
		        r'.*?integer.*?>(.*?)</i>.*?fraction.*?>(\d+)</i>.*?</dd>',
		        re.S)
		    return pattern.findall(content)
		 
		def processData(results):
		    """Lazily convert raw match tuples into movie dictionaries.

		    :param results: iterable of indexable records holding at least seven
		        string fields each
		    :return: generator yielding one dict per record with the keys
		        index, imgurl, name, star, releasetime and score
		    """
		    for fields in results:
		        movie = {}
		        movie['index'] = fields[0]
		        movie['imgurl'] = fields[1]
		        movie['name'] = fields[2]
		        # Trim surrounding whitespace, then drop the first 3 characters
		        # (presumably a label prefix - verify against the page markup).
		        actors = fields[3].strip()
		        movie['star'] = actors[3:]
		        # Trim surrounding whitespace, then drop the first 5 characters
		        # (presumably a label prefix - verify against the page markup).
		        released = fields[4].strip()
		        movie['releasetime'] = released[5:]
		        # The two score captures are concatenated into one string.
		        movie['score'] = fields[5] + fields[6]
		        yield movie
		 
		def storeData(data, path="mmovie.txt"):
		    """Append one record to a text file as a single JSON line.

		    :param data: the object to serialise and write
		    :param path: file to append to; defaults to "mmovie.txt"
		    :return: None
		    """
		    # ensure_ascii=False writes non-ASCII text as-is instead of as \uXXXX
		    # escapes; the file is opened as UTF-8 to match.
		    with open(path, 'a', encoding='utf-8') as f:
		        f.write(json.dumps(data, ensure_ascii=False) + '\n')
		        # No explicit close: the with-statement closes the file on exit.
		 
		def main(offset):
		    """Fetch one board page, parse it and append its movies to the output file.

		    :param offset: value placed in the ``offset`` query parameter of the
		        board URL
		    :return: None
		    """
		    url = 'http://maoyan.com/board/4?offset=' + str(offset)
		    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'}
		    html = getOneContent(url, headers=headers)
		    results = parserContent(html)
		    if not results:
		        # A failed download or a page without matches leaves nothing to
		        # store; bail out so processData is never handed None.
		        return
		    for item in processData(results):
		        storeData(item)
		 
		if __name__ == '__main__':
		    # One task per page: offsets 0, 10, ..., 90 are distributed over the
		    # worker processes. The with-block shuts the workers down once the
		    # blocking map() call has returned, instead of leaving the pool open.
		    with Pool() as pool:
		        pool.map(main, [i * 10 for i in range(10)])
		 
		首发:传智播客人工智能+Python培训学院
		作者:http://python.itcast.cn/ | 
		 
	
	 
	 
	 
	 
	 |