- 資訊首頁 > 開發技術 > web開發 >
- 北京、天津工廠自動監測數據爬取
&&&&&
# -*- coding: utf-8 -*-
"""Scrape a company's automatic-monitoring records and export them to .xls.

For every date in a range the script POSTs ``startTime``/``pageIndex`` form
fields to a page URL, reads the ``<tr>``/``<td>`` table rows out of the HTML
that comes back, and writes all collected rows into a single xlwt workbook.
"""
from bs4 import BeautifulSoup
import requests
from xlwt import Workbook
import time
import datetime
import re
import sys

try:
    # Python 2 only: make implicit str/unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # Python 3 has no builtin reload(); nothing to do there.
    pass

# Seconds to wait for the server before a request is abandoned.  Without a
# timeout a single stalled connection would block the whole run forever.
REQUEST_TIMEOUT = 30

# Pause (seconds) used between page requests to avoid hammering the server.
PAGE_DELAY = 0.8

# Spreadsheet header row, in column order.
HEADERS = (
    u'序號',
    u'監測點位',
    u'監測時間',
    u'監測項目',
    u'監測結果',
    u'標準限值',
    u'單位',
    u'是否達標',
    u'超標倍數',
    u'評價標準',
    u'排放去向',
    u'排放方式',
    u'備注',
)

# Number of cells read from every table row (one per header).
COLUMN_COUNT = len(HEADERS)

# Columns that may be missing or blank in a row; they default to ''.
OPTIONAL_COLUMNS = (8, 12)


def _post(url, form, retry_delay):
    """POST *form* to *url*, retrying once after *retry_delay* seconds.

    A failure of the second attempt propagates to the caller.
    """
    try:
        return requests.post(url, data=form, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        time.sleep(retry_delay)
        return requests.post(url, data=form, timeout=REQUEST_TIMEOUT)


def _first_token(cells, index, optional=False):
    """Return the first whitespace-delimited token of ``cells[index].text``.

    Raises IndexError when the cell is absent or blank, unless *optional* is
    true, in which case '' is returned instead.
    """
    # NOTE(review): only the first token is kept, so a cell whose text
    # contains embedded whitespace is truncated -- confirm this is intended.
    try:
        return cells[index].text.split()[0]
    except IndexError:
        if optional:
            return ''
        raise


def _total_pages(pager_text):
    """Return the digits that follow the first '/' in *pager_text*.

    The whole run of digits is returned (as a string), so totals of ten
    pages or more are no longer cut down to their first digit.

    Raises ValueError when no digits follow the slash.
    """
    after_slash = pager_text.split('/')[1]
    match = re.match(r'\s*(\d+)', after_slash)
    if match is None:
        raise ValueError("cannot read page count from %r" % (pager_text,))
    return match.group(1)


def shuju(url, date, page):
    """Fetch one page of monitoring rows for one date.

    Args:
        url: Page URL the form is posted to.
        date: Value sent as the ``startTime`` form field.
        page: Value sent as the ``pageIndex`` form field.

    Returns:
        A list of rows; each row is a list of ``COLUMN_COUNT`` strings in
        header order.  Table rows lacking a required cell (for example
        header rows without ``<td>`` cells) are skipped.
    """
    form = {
        "startTime": date,
        "pageIndex": page,
    }
    response = _post(url, form, retry_delay=2)
    soup = BeautifulSoup(response.text, 'lxml')

    datalist = []
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        try:
            record = [
                _first_token(cells, index, optional=index in OPTIONAL_COLUMNS)
                for index in range(COLUMN_COUNT)
            ]
        except IndexError:
            # Not a complete data row.
            continue
        datalist.append(record)
    return datalist


def pageNumber(url, date):
    """Return ``(page_count, company_name)`` for one date.

    Args:
        url: Page URL the form is posted to.
        date: Value sent as the ``startTime`` form field.

    Returns:
        A tuple of the page total as a string of digits (callers convert it
        with ``int``) and the text of the company title element.

    Raises:
        AttributeError: If the pager or title element is not in the page.
        ValueError: If the pager text has no digits after its '/'.
    """
    form = {
        "startTime": date,
        "pageIndex": "",
    }
    response = _post(url, form, retry_delay=1)
    soup = BeautifulSoup(response.text, 'lxml')
    pager_text = soup.find('span', class_="clr_b ver_mid").string
    compname = soup.find('div', class_="com_tit_new f_22 clr_3").string
    return _total_pages(pager_text), compname


def Date_list_generation(start, end):
    """Return every date from *start* through *end*, both inclusive.

    Args:
        start: First date, formatted ``YYYY-MM-DD``.
        end: Last date, formatted ``YYYY-MM-DD``.

    Returns:
        A list of ``YYYY-MM-DD`` strings in ascending order; empty when
        *start* is later than *end*.
    """
    first = datetime.datetime.strptime(str(start), '%Y-%m-%d')
    last = datetime.datetime.strptime(str(end), '%Y-%m-%d')
    day_count = (last - first).days + 1
    return [
        (first + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(day_count)
    ]


def pao(start, end, url):
    """Scrape all pages for every date in the range and save them as .xls.

    The workbook is saved in the current directory as
    ``<company>_<start>_<end>.xls``.

    Args:
        start: First date, formatted ``YYYY-MM-DD``.
        end: Last date, formatted ``YYYY-MM-DD``.
        url: Page URL the forms are posted to.

    Raises:
        ValueError: If the date range is empty (*start* later than *end*).
    """
    dates = Date_list_generation(start, end)
    if not dates:
        raise ValueError(
            "start date %s is after end date %s" % (start, end))

    book = Workbook(encoding='utf-8')
    sheet1 = book.add_sheet('Sheet 1')
    for column, title in enumerate(HEADERS):
        sheet1.write(0, column, title)

    all_rows = []
    compname = None
    for date in dates:
        pagenumber, compname = pageNumber(url, date)
        for page in range(1, int(pagenumber) + 1):
            try:
                rows = shuju(url, date, page)
            except Exception:
                # Best effort: report the page that failed and move on.  Its
                # rows are skipped rather than re-adding the previous page's.
                print(page)
            else:
                print("%s %s" % (date, page))
                all_rows.extend(rows)
                time.sleep(PAGE_DELAY)
            time.sleep(PAGE_DELAY)

    # Data rows start at sheet row 1, directly below the header row.
    for row_index, record in enumerate(all_rows, 1):
        for column, value in enumerate(record):
            sheet1.write(row_index, column, value)

    tablename = "%s_%s_%s.xls" % (compname, start, end)
    book.save(tablename)


if __name__ == "__main__":
    start = "2017-05-01"
    end = "2017-06-01"
    url = ""
    pao(start, end, url)
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、來自互聯網轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:ts@56dr.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。
Copyright © 2009-2021 56dr.com. All Rights Reserved. 特網科技 版權所有 珠海市特網科技有限公司 粵ICP備16109289號
域名注冊服務機構:阿里云計算有限公司(萬網) 域名服務機構:煙臺帝思普網絡科技有限公司(DNSPod) CDN服務:阿里云計算有限公司 中國互聯網舉報中心 增值電信業務經營許可證B2 建議您使用Chrome、Firefox、Edge、IE10及以上版本和360等主流瀏覽器瀏覽本網站