发布时间:2018-07-11 12:37:05编辑:Run阅读(5736)
Mechanize常用函数
.CookieJar():设置cookie
.Browser():打开浏览器
.addheaders:设置请求头(如User-Agent),用来把请求伪装成普通浏览器
.open():打开网页,按照官网描述可以打开任意网页,不仅限于http
.select_form():选择表单,按表单ID选择时需要注意。
.form[]:填写form表单信息
.submit():提交
环境介绍:
本地有个django服务,访问http://127.0.0.1:8000如下
成功登录后,页面会显示登录账号和登录时间
利用mechanize模拟登录,再用bs4获取登录信息
注意:本文使用的mechanize版本只支持Python 2.x
完整代码如下
#!/usr/bin/env python
# coding: utf-8
"""Log in to a site with mechanize, then scrape the post-login page with bs4."""
import sys

import mechanize
from bs4 import BeautifulSoup

# Python 2.7 only: make utf-8 the implicit str/unicode codec.
# reload() is not a builtin on Python 3, so the hack is skipped there
# instead of failing with NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 (Python 2 builtin)
    sys.setdefaultencoding('utf-8')


class Item(object):
    """Container for the fields scraped from the post-login page."""

    landing_name = None  # account name shown after login
    landing_time = None  # login time shown after login


class SimulateLogin(object):
    """Submit the login form at ``url`` and print the account info shown afterwards.

    Instantiating the class runs the whole flow, because ``__init__`` calls
    ``bs4_filter()``; construction therefore performs network I/O.
    """

    def __init__(self, url, username, password):
        self.url = url
        self.username = username
        self.password = password
        self.bs4_filter()

    def mechanize_setting(self):
        """Create and return a configured ``mechanize.Browser``."""
        br = mechanize.Browser()

        # Browser behaviour switches.
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_gzip(False)
        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Send a browser-like User-agent header with every request.
        br.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
             'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
        ]
        return br

    def login(self):
        """Open ``self.url``, fill in the POST form and submit it.

        Returns:
            The body of the response that follows the submit, or ``None``
            when selecting, filling or submitting the form raised (the
            error is printed in that case).
        """
        br = self.mechanize_setting()
        br.open(self.url)

        # Print every form on the page so the field names can be checked.
        for form in br.forms():
            print(form)

        # The form is selected by its request method (post); the control
        # named 'name' takes the account and the one named 'pwd' takes the
        # password.
        try:
            br.select_form(method='post')
            br.form['name'] = self.username
            br.form['pwd'] = self.password
            br.submit()
        except Exception as e:
            print('form表信息填写错误:%s' % e)
            return None
        return br.response().read()

    def bs4_filter(self):
        """Log in, then print the account name and login time from the navbar."""
        ret = self.login()
        if ret is None:
            # login() already reported the failure; there is no page to
            # parse, and BeautifulSoup(None) would raise.
            return

        soup = BeautifulSoup(ret, 'lxml')
        navbars = soup.find_all('ul', attrs={'class': 'nav navbar-nav navbar-right'})

        items = []
        for tag in navbars:
            entries = tag.find_all('li')
            if len(entries) < 2:
                # A navbar without both entries carries no login info;
                # skip it rather than raise IndexError.
                continue
            item = Item()
            item.landing_name = entries[0].get_text().strip()
            item.landing_time = entries[1].get_text().strip()
            items.append(item)

        for item in items:
            print('登陆账号:%s\n登陆时间:%s' % (item.landing_name, item.landing_time))


if __name__ == '__main__':
    url = 'http://127.0.0.1:8000/'
    SimulateLogin(url, 'zhangsan', '123')
运行效果:
如果需要改成外网的爬取,代码改成:
#!/usr/bin/env python
# coding: utf-8
"""Log in to a site with mechanize and print the page returned after login."""
import sys

import mechanize
from bs4 import BeautifulSoup

# Python 2.7 only: make utf-8 the implicit str/unicode codec.
# reload() is not a builtin on Python 3, so the hack is skipped there
# instead of failing with NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 (Python 2 builtin)
    sys.setdefaultencoding('utf-8')


class Item(object):
    """Container for the fields to scrape from the post-login page."""

    landing_name = None  # account name shown after login
    landing_time = None  # login time shown after login


class SimulateLogin(object):
    """Submit the login form at ``url`` and print the page that follows.

    Instantiating the class runs the login, because ``__init__`` calls
    ``login()``; construction therefore performs network I/O.
    """

    def __init__(self, url, username, password):
        self.url = url
        self.username = username
        self.password = password
        self.login()

    def mechanize_setting(self):
        """Create and return a configured ``mechanize.Browser``."""
        br = mechanize.Browser()

        # Browser behaviour switches.
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_gzip(False)
        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Send a browser-like User-agent header with every request.
        br.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
             'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
        ]
        return br

    def login(self):
        """Open ``self.url``, fill in the POST form, submit it and print the result.

        Returns:
            The body of the response that follows the submit (also printed),
            or ``None`` when selecting, filling or submitting the form
            raised (the error is printed in that case).
        """
        br = self.mechanize_setting()
        br.open(self.url)

        # Print every form on the page so the field names can be checked.
        for form in br.forms():
            print(form)

        # The form is selected by its request method (post); the control
        # named 'name' takes the account and the one named 'pwd' takes the
        # password.
        try:
            br.select_form(method='post')
            br.form['name'] = self.username
            br.form['pwd'] = self.password
            br.submit()
        except Exception as e:
            print('form表信息填写错误:%s' % e)
            return None

        # Read the page served after login; return it as well as printing
        # it so callers can extract whatever data they need from it.
        ret = br.response().read()
        print(ret)
        return ret


if __name__ == '__main__':
    url = 'http://127.0.0.1:8000/'  # login page of the site
    SimulateLogin(url, 'zhangsan', '123')  # url, account, password
运行结果:可以根据需求去提取对应的数据
47900
46479
37391
34792
29363
26026
24995
19994
19613
18094
5832°
6468°
5976°
5997°
7110°
5947°
5997°
6487°
6451°
7832°