发布时间: 2019-09-05 07:06:33 | 编辑: auto | 阅读: 2358
简单小爬虫
#!/usr/bin/env python
#coding:utf-8
import urllib2
import bs4
# Page whose list-item links are scanned for article URLs.
url = 'http://www.163.com'


def is_news_link(href, title):
    """Return True when a link passes the article filter.

    A link is kept when its href contains both '.html' and '163.com'
    and its visible text is longer than three characters.
    """
    return '.html' in href and '163.com' in href and len(title) > 3


def format_summary(count):
    """Return the summary line with *count* substituted into the template."""
    # The original passed the count as a second print operand, so the
    # '%s' placeholder was emitted verbatim; apply the format explicitly.
    return '共有新闻[%s]条' % count


def fetch_news_links(page_url):
    """Download *page_url* and return the <li> anchors that pass the filter.

    The response body is decoded as GBK before parsing. Network and decode
    errors are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen(page_url)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    content = content.decode('gbk')
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(content, 'html.parser')
    return [
        link for link in soup.select('li a[href]')
        if is_news_link(link.attrs['href'], link.text)
    ]


def main():
    """Print every matching link (href and text), then the total count."""
    result = fetch_news_links(url)
    for link in result:
        # Single-argument print(...) behaves the same under the Python 2
        # print statement and as a function call.
        print('%s %s' % (link.attrs['href'], link.text))
    print(format_summary(len(result)))


if __name__ == '__main__':
    main()
上一篇: cluster(3)
下一篇: ppp 3
51552
51119
41649
38405
32892
29871
28585
23543
23475
21821
1968°
2672°
2224°
2163°
2609°
2188°
2920°
4845°
4685°
3329°