python爬虫 糗事百科
获取不含图片的糗事内容
python+BeautifulSoup
# -*- coding=utf-8 -*-
#python pullingdata.py 100 data.txt
import requests
from BeautifulSoup import BeautifulSoup
import sys
# Scrape text-only posts (entries without an image thumbnail) from
# qiushibaike.com and write them, numbered from 1, to a UTF-8 output file.
# Usage: python pullingdata.py <count> <output-file>
try:
    num = int(sys.argv[1])
    out = sys.argv[2]
except (IndexError, ValueError) as e:
    # Neither `num` nor `out` is usable here, so stop instead of failing
    # later with a NameError.
    print('args error %s' % e)
    sys.exit(1)

count = 0  # number of entries written so far
session = requests.Session()
link = 'http://www.qiushibaike.com/'
resp = session.get(link)

# Open the output once, truncating any previous content. Binary mode is used
# because each entry is written as explicitly UTF-8 encoded bytes.
with open(out, 'wb') as fh:
    while count < num:
        soup = BeautifulSoup(resp.text)
        for item in soup.findAll('div', {'class': 'content'}):
            if count >= num:
                # Stop at exactly the requested number of entries, even in
                # the middle of a page.
                break
            # A following sibling <div class="thumb"> marks an entry that
            # carries an image; those are skipped.
            if item.findNextSibling('div', {'class': 'thumb'}) is not None:
                continue
            first = item.contents[0] if item.contents else None
            text = getattr(first, 'string', None)
            if text is None:
                # No plain text in the first child; skip rather than crash.
                continue
            count += 1
            entry = str(count) + '. ' + text.strip() + '\n\n'
            fh.write(entry.encode('utf-8'))
        if count >= num:
            break
        next_link = soup.find('a', {'class': 'next'})
        href = next_link.get('href') if next_link is not None else None
        if not href:
            # No "next" link on this page: nothing more to fetch.
            break
        # NOTE(review): assumes href is site-relative and starts with '/';
        # the trailing '/' of `link` is dropped before joining.
        resp = session.get(link[:-1] + href)
sunday算法的简单实现
# -*- coding: utf-8 -*-
def sunday(dst, sub):
    """Return the index of the first occurrence of ``sub`` in ``dst``.

    Uses the Sunday substring-search algorithm: after a failed comparison
    of the current window, the element of ``dst`` just past the window
    decides how far the window can be shifted.

    Args:
        dst: The text to search in (an indexable sequence, e.g. a string).
        sub: The pattern to search for. Its elements must be hashable,
            which holds for the characters of a string.

    Returns:
        The lowest index at which ``sub`` starts in ``dst``, ``0`` for an
        empty pattern, or ``-1`` when the pattern does not occur.

    >>> sunday("abcxxxbaaaabaaaxbbaaabcdamno", "aaab")
    8
    >>> sunday("aaab", "aaab")
    0
    >>> sunday("abcd", "xbcd")
    -1
    """
    ld = len(dst)
    ls = len(sub)
    if ls == 0:
        # An empty pattern matches at the very start, as with str.find.
        return 0

    # Rightmost index of every pattern element; later occurrences overwrite
    # earlier ones, so each entry ends up holding the last position.
    last = {}
    for idx, ch in enumerate(sub):
        last[ch] = idx

    i = 0  # start of the current window in dst
    while i + ls <= ld:
        j = 0  # number of pattern elements matched so far
        while j < ls and dst[i + j] == sub[j]:
            j += 1
        if j == ls:
            return i
        if i + ls >= ld:
            # The window already touches the end of dst: there is no
            # element past it to drive a shift, and no room for a match.
            return -1
        # Align the rightmost occurrence of the element just past the
        # window with that element; if the pattern does not contain it,
        # jump completely past it (shift of ls + 1).
        i += ls - last.get(dst[i + ls], -1)
    return -1
def find(sub, ls, ch):
    """Return the rightmost index below ``ls`` at which ``sub`` holds ``ch``.

    Args:
        sub: The sequence to scan.
        ls: Number of leading elements of ``sub`` to consider; the scan
            starts at index ``ls - 1`` and moves towards index 0.
        ch: The element to look for.

    Returns:
        The highest matching index, or ``-1`` when ``ch`` is not found.
    """
    pos = ls - 1
    while pos >= 0:
        if sub[pos] == ch:
            return pos
        pos -= 1
    return -1
if __name__ == "__main__":
    # Small demo: print the index of the first "aaab" in the sample text.
    d = "abcxxxbaaaabaaaxbbaaabcdamno"
    s = "aaab"
    # Call form of print with a single argument behaves the same on
    # Python 2 and Python 3.
    print(sunday(d, s))