练习3

发布时间: 2019-09-05 07:06:33  编辑: auto  阅读: 1890

    简单小爬虫


    #!/usr/bin/env python
    #coding:utf-8
    # Simple crawler: download the 163.com front page, collect the anchors
    # inside list items that look like article links, print each one, and
    # finish with the total count.

    from contextlib import closing

    try:
        import urllib2
    except ImportError:
        # Python 3 moved urlopen into urllib.request; alias it so the rest of
        # the script can keep using the urllib2 name.
        import urllib.request as urllib2

    import bs4

    url = 'http://www.163.com'

    # Close the HTTP response as soon as the body has been read instead of
    # leaving the connection open until interpreter exit.
    with closing(urllib2.urlopen(url)) as response:
        content = response.read()

    # The script decodes the raw page bytes as GBK before parsing.
    content = content.decode('gbk')

    # Name the parser explicitly: without it bs4 picks whichever parser
    # library happens to be installed and emits a warning.
    soup = bs4.BeautifulSoup(content, 'html.parser')
    links = soup.select('li a[href]')

    # Keep anchors whose href contains both '.html' and '163.com' and whose
    # text is longer than three characters.
    result = [
        link for link in links
        if '.html' in link.attrs['href']
        and '163.com' in link.attrs['href']
        and len(link.text) > 3
    ]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for link in result:
        print('%s %s' % (link.attrs['href'], link.text))

    # Apply the count to the %s placeholder; the original passed it as a
    # second print argument, so the literal "[%s]" was printed unformatted.
    print('共有新闻[%s]条' % len(result))


关键字

上一篇: cluster(3)

下一篇: ppp 3